diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ed37fa80b9..0e42635244 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -83,7 +83,7 @@ src/bond.* @sjplimp src/comm*.* @sjplimp src/compute.* @sjplimp src/dihedral.* @sjplimp -src/domain.* @sjplimp +src/domain.* @sjplimp @stanmoore1 src/dump*.* @sjplimp src/error.* @sjplimp src/finish.* @sjplimp diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index e74893d0d0..a2f462905a 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -45,8 +45,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.2.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "16b9b09ae947d434dfb58fc5c87c2b76" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.3.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "889dcea2b5ced3debdc5b0820044bdc4" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) GetFallbackURL(KOKKOS_URL KOKKOS_FALLBACK) @@ -71,7 +71,7 @@ if(DOWNLOAD_KOKKOS) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 4.2.01 REQUIRED CONFIG) + find_package(Kokkos 4.3.00 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) else() set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 1f643a9d14..f66238c3c9 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -533,9 +533,6 @@ They must be specified in uppercase. * - A64FX - HOST - ARMv8.2 with SVE Support - * - WSM - - HOST - - Intel Westmere CPU (SSE 4.2) * - SNB - HOST - Intel Sandy/Ivy Bridge CPU (AVX 1) @@ -566,18 +563,15 @@ They must be specified in uppercase. * - KNL - HOST - Intel Knights Landing Xeon Phi - * - BGQ - - HOST - - IBM Blue Gene/Q CPU - * - POWER7 - - HOST - - IBM POWER7 CPU * - POWER8 - HOST - IBM POWER8 CPU * - POWER9 - HOST - IBM POWER9 CPU + * - RISCV_SG2042 + - HOST + - SG2042 (RISC-V) CPU * - KEPLER30 - GPU - NVIDIA Kepler generation CC 3.0 GPU @@ -666,7 +660,7 @@ They must be specified in uppercase. - GPU - Intel GPU Ponte Vecchio -This list was last updated for version 4.2 of the Kokkos library. +This list was last updated for version 4.3.0 of the Kokkos library. .. tabs:: diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 9bbe216dec..514785c15c 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -245,6 +245,7 @@ OPT. * :doc:`oxrna2/coaxstk ` * :doc:`pace (k) ` * :doc:`pace/extrapolation (k) ` + * :doc:`pedone (o) ` * :doc:`pod ` * :doc:`peri/eps ` * :doc:`peri/lps (o) ` diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index 625ce15202..76e90e17ba 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -635,10 +635,10 @@ Tohoku University (under MIT license) ---------- -.. doxygenfunction:: MathEigen::jacobi3(double const *const *mat, double *eval, double **evec) +.. doxygenfunction:: MathEigen::jacobi3(double const *const *mat, double *eval, double **evec, int sort) :project: progguide -.. doxygenfunction:: MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3]) +.. doxygenfunction:: MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3], int sort) :project: progguide --------------------------- diff --git a/doc/src/Errors_details.rst b/doc/src/Errors_details.rst index b63d80a9ae..c798784106 100644 --- a/doc/src/Errors_details.rst +++ b/doc/src/Errors_details.rst @@ -13,15 +13,44 @@ discussions of such cases. Unknown identifier in data file ------------------------------- -This error happens when LAMMPS encounters a line of text in an unexpected format -while reading a data file. This is most commonly cause by inconsistent header and -section data. The header section informs LAMMPS how many entries or lines are expected in the -various sections (like Atoms, Masses, Pair Coeffs, *etc.*\ ) of the data file. -If there is a mismatch, LAMMPS will either keep reading beyond the end of a section -or stop reading before the section has ended. +This error happens when LAMMPS encounters a line of text with an +unexpected keyword while :doc:`reading a data file `. This +would be either header keywords or section header keywords. This is +most commonly due to a mistyped keyword or due to a keyword that is +inconsistent with the :doc:`atom style ` used. -Such a mismatch can happen unexpectedly when the first line of the data -is *not* a comment as required by the format. That would result in -LAMMPS expecting, for instance, 0 atoms because the "atoms" header line -is treated as a comment. +The header section informs LAMMPS how many entries or lines are expected +in the various sections (like Atoms, Masses, Pair Coeffs, *etc.*\ ) of +the data file. If there is a mismatch, LAMMPS will either keep reading +beyond the end of a section or stop reading before the section has +ended. In that case the next line will not contain a recognized keyword. +Such a mismatch can also happen when the first line of the data +is *not* a comment as required by the format, but a line with a valid +header keyword. That would result in LAMMPS expecting, for instance, +0 atoms because the "atoms" header line is the first line and thus +treated as a comment. + +Another possibility to trigger this error is to have a keyword in the +data file that corresponds to a fix (e.g. :doc:`fix cmap `) +but the :doc:`read_data ` command is missing the (optional) +arguments that identify the fix and the header keyword and section +keyword or those arguments are inconsistent with the keywords in the +data file. + +.. _err0002: + +Incorrect format in ... section of data file +-------------------------------------------- + +This error happens when LAMMPS reads the contents of a section of a +:doc:`data file ` and the number of parameters in the line +differs from what is expected. This most commonly happens, when the +atom style is different from what is expected for a specific data file +since changing the atom style usually changes the format of the line. + +This error can also happen when the number of entries indicated in the +header of a data file (e.g. the number of atoms) is larger than the +number of lines provided (e.g. in the corresponding Atoms section) +and then LAMMPS will continue reading into the next section and that +would have a completely different format. diff --git a/doc/src/fix_ave_correlate.rst b/doc/src/fix_ave_correlate.rst index 1aff749048..659e15105c 100644 --- a/doc/src/fix_ave_correlate.rst +++ b/doc/src/fix_ave_correlate.rst @@ -65,7 +65,6 @@ Examples fix 1 all ave/correlate 1 50 10000 & c_thermo_press[1] c_thermo_press[2] c_thermo_press[3] & type upper ave running title1 "My correlation data" - fix 1 all ave/correlate 1 50 10000 c_thermo_press[*] Description diff --git a/doc/src/fix_ave_correlate_long.rst b/doc/src/fix_ave_correlate_long.rst index e2b23248f2..003bdf897d 100644 --- a/doc/src/fix_ave_correlate_long.rst +++ b/doc/src/fix_ave_correlate_long.rst @@ -20,11 +20,11 @@ Syntax .. parsed-literal:: c_ID = global scalar calculated by a compute with ID - c_ID[I] = Ith component of global vector calculated by a compute with ID + c_ID[I] = Ith component of global vector calculated by a compute with ID, I can include wildcard (see below) f_ID = global scalar calculated by a fix with ID - f_ID[I] = Ith component of global vector calculated by a fix with ID + f_ID[I] = Ith component of global vector calculated by a fix with ID, I can include wildcard (see below) v_name = global value calculated by an equal-style variable with name - v_name[I] = Ith component of global vector calculated by a vector-style variable with name + v_name[I] = Ith component of a vector-style variable with name, I can include wildcard (see below) * zero or more keyword/arg pairs may be appended * keyword = *type* or *start* or *file* or *overwrite* or *title1* or *title2* or *ncorr* or *nlen* or *ncount* @@ -63,6 +63,7 @@ Examples fix 1 all ave/correlate/long 1 10000 & c_thermo_press[1] c_thermo_press[2] c_thermo_press[3] & type upper title1 "My correlation data" nlen 15 ncount 3 + fix 1 all ave/correlate/long 1 10000 c_thermo_press[*] Description """"""""""" @@ -80,8 +81,10 @@ specified values may represent calculations performed by computes and fixes which store their own "group" definitions. Each listed value can be the result of a compute or fix or the -evaluation of an equal-style variable. See the -:doc:`fix ave/correlate ` page for details. +evaluation of an equal-style or vector-style variable. For +vector-style variables, the specified indices can include a wildcard +character. See the :doc:`fix ave/correlate ` page +for details. The *Nevery* and *Nfreq* arguments specify on what time steps the input values will be used to calculate correlation data and the frequency diff --git a/doc/src/fix_ttm.rst b/doc/src/fix_ttm.rst index 5a7f864686..f3e6a08d61 100644 --- a/doc/src/fix_ttm.rst +++ b/doc/src/fix_ttm.rst @@ -136,23 +136,23 @@ transfer between the subsystems: \bigtriangledown (\kappa_e \bigtriangledown T_e) - g_p (T_e - T_a) + g_s T_a' -where C_e is the specific heat, rho_e is the density, kappa_e is the -thermal conductivity, T is temperature, the "e" and "a" subscripts -represent electronic and atomic subsystems respectively, g_p is the -coupling constant for the electron-ion interaction, and g_s is the -electron stopping coupling parameter. C_e, rho_e, and kappa_e are -specified as parameters to the fix. The other quantities are derived. -The form of the heat diffusion equation used here is almost the same -as that in equation 6 of :ref:`(Duffy) `, with the exception that the -electronic density is explicitly represented, rather than being part -of the specific heat parameter. +where :math:`C_e` is the specific heat, :math:`\rho_e` is the density, +:math:`\kappa_e` is the thermal conductivity, *T* is temperature, the +"e" and "a" subscripts represent electronic and atomic subsystems +respectively, :math:`g_p` is the coupling constant for the electron-ion +interaction, and :math:`g_s` is the electron stopping coupling +parameter. :math:`C_e`, :math:`\rho_e`, and :math:`\kappa_e` are +specified as parameters to the fix *ttm* or *ttm/grid*. The other +quantities are derived. The form of the heat diffusion equation used +here is almost the same as that in equation 6 of :ref:`(Duffy) `, +with the exception that the electronic density is explicitly +represented, rather than being part of the specific heat parameter. Currently, the TTM fixes assume that none of the user-supplied -parameters will vary with temperature. Note that :ref:`(Duffy) -` used a tanh() functional form for the temperature dependence -of the electronic specific heat, but ignored temperature dependencies -of any of the other parameters. See more discussion below for fix -ttm/mod. +parameters will vary with temperature. Note that :ref:`(Duffy) ` +used a tanh() functional form for the temperature dependence of the +electronic specific heat, but ignored temperature dependencies of any of +the other parameters. See more discussion below for fix *ttm/mod*. .. note:: @@ -265,27 +265,27 @@ heat sources (e.g. laser heating in ablation simulations): \bigtriangledown (\kappa_e \bigtriangledown T_e) - g_p (T_e - T_a) + g_s T_a' + \theta (x-x_{surface})I_0 \exp(-x/l_{skin}) -where theta is the Heaviside step function, I_0 is the (absorbed) -laser pulse intensity for ablation simulations, l_skin is the depth -of skin-layer, and all other designations have the same meaning as in -the former equation. The duration of the pulse is set by the parameter -*tau* in the *init_file*. +where :math:`\theta` is the Heaviside step function, :math:`I_0` is the +(absorbed) laser pulse intensity for ablation simulations, +:math:`l_{skin}` is the depth of the skin-layer, and all other +designations have the same meaning as in the former equation. The +duration of the pulse is set by the parameter *tau* in the *init_file*. -Fix ttm/mod also allows users to specify the dependencies of C_e and -kappa_e on the electronic temperature. The specific heat is expressed -as +Fix *ttm/mod* also allows users to specify the dependencies of +:math:`C_e` and :math:`\kappa_e` on the electronic temperature. The +specific heat is expressed as .. math:: C_e = C_0 + (a_0 + a_1 X + a_2 X^2 + a_3 X^3 + a_4 X^4) \exp (-(AX)^2) -where *X* = T_e/1000, and the thermal conductivity is defined as -kappa_e = D_e\*rho_e\*C_e, where D_e is the thermal diffusion -coefficient. +where :math:`X = \frac{T_e}{1000}`, and the thermal conductivity is +defined as :math:`\kappa_e = D_e \cdot rho_e \cdot C_e`, where +:math:`D_e` is the thermal diffusion coefficient. -Electronic pressure effects are included in the TTM model to account -for the blast force acting on ions because of electronic pressure -gradient (see :ref:`(Chen) `, :ref:`(Norman) `). The total force +Electronic pressure effects are included in the TTM model to account for +the blast force acting on ions because of electronic pressure gradient +(see :ref:`(Chen) `, :ref:`(Norman) `). The total force acting on an ion is: .. math:: @@ -293,13 +293,14 @@ acting on an ion is: {\vec F}_i = - \partial U / \partial {\vec r}_i + {\vec F}_{langevin} - \nabla P_e/n_{ion} -where F_langevin is a force from Langevin thermostat simulating -electron-phonon coupling, and nabla P_e/n_ion is the electron blast -force. +where :math:`F_{langevin}` is a force from Langevin thermostat +simulating electron-phonon coupling, and :math:`\nabla P_e/n_{ion}` is +the electron blast force. -The electronic pressure is taken to be P_e = B\*rho_e\*C_e\*T_e +The electronic pressure is taken to be :math:`P_e = B \cdot rho_e \cdot +C_e \cdot T_e` -The current fix ttm/mod implementation allows TTM simulations with a +The current fix *ttm/mod* implementation allows TTM simulations with a vacuum. The vacuum region is defined as the grid cells with zero electronic temperature. The numerical scheme does not allow energy exchange with such cells. Since the material can expand to previously @@ -319,10 +320,10 @@ electronic pressure gradient is calculated as \frac{x}{x+\lambda}\frac{(C_e{}T_e)_{x+\Delta x}-(C_e{}T_e)_{x}}{\Delta x} \right] -where lambda is the electron mean free path (see :ref:`(Norman) `, -:ref:`(Pisarev) `) +where :math:`\lambda` is the electron mean free path (see :ref:`(Norman) +`, :ref:`(Pisarev) `) -The fix ttm/mod parameter file *init_file* has the following syntax. +The fix *ttm/mod* parameter file *init_file* has the following syntax. Every line with an odd number is considered as a comment and ignored. The lines with the even numbers are treated as follows: diff --git a/doc/src/pair_pedone.rst b/doc/src/pair_pedone.rst new file mode 100644 index 0000000000..32fcb17f00 --- /dev/null +++ b/doc/src/pair_pedone.rst @@ -0,0 +1,137 @@ +.. index:: pair_style pedone +.. index:: pair_style pedone/omp + +pair_style pedone command +========================= + +Accelerator Variants: *pedone/omp* + + +Syntax +"""""" + +.. code-block:: LAMMPS + + pair_style style args + +* style = pedone* +* args = list of arguments for a particular style + +.. parsed-literal:: + + *pedone* args = cutoff + cutoff = global cutoff for Pedone interactions (distance units) + +Examples +"""""""" + +.. code-block:: LAMMPS + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-5 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +Used in input scripts: + + .. parsed-literal:: + + examples/PACKAGES/pedone/in.pedone.relax + examples/PACKAGES/pedone/in.pedone.melt + + + +Description +""""""""""" + +.. versionadded:: TBD + +Pair style *pedone* computes the **non-Coulomb** interactions of the Pedone +(or PMMCS) potential :ref:`(Pedone) ` which combines Coulomb +interactions, Morse potential, and repulsive :math:`r^{-12}` +Lennard-Jones terms (see below). The *pedone* pair style is meant +to be used in addition to a :doc:`Coulomb pair style ` via +pair style :doc:`hybrid/overlay ` (see example above). +Using *coul/long* or *could/dsf* (for solids) is recommended. + +The full Pedone potential function from :ref:`(Pedone) ` for each +pair of atoms is: + +.. math:: + + E = \frac{C q_i q_j}{\epsilon r} + + D_0 \left[ e^{- 2 \alpha (r - r_0)} - 2 e^{- \alpha (r - r_0)} \right] + + \frac{B_0}{r^{12}} \qquad r < r_c + +:math:`r_c` is the cutoff and :math:`C` is a conversion factor that is +specific to the choice of :doc:`units ` so that the entire +Coulomb term is in energy units with :math:`q_i` and :math:`q_j` as the +assigned charges in multiples of the elementary charge. + +The following coefficients must be defined for the selected pairs of +atom types via the :doc:`pair_coeff ` command as in the +example above: + +* :math:`D_0` (energy units) +* :math:`\alpha` (1/distance units) +* :math:`r_0` (distance units) +* :math:`C_0` (energy units) +* cutoff (distance units) + +The last coefficient is optional. If not specified, the global *pedone* +cutoff is used. + +---------- + +.. include:: accel_styles.rst + +---------- + +Mixing, shift, table, tail correction, restart, rRESPA info +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +This pair style does not support mixing. + +This pair style support the :doc:`pair_modify ` shift +option for the energy of the pair interaction. + +This pair style does not support the :doc:`pair_modify ` +tail option for adding long-range tail corrections to energy and +pressure. + +This pair style writes its information to :doc:`binary restart files `, +so pair_style and pair_coeff commands does not need to be specified in an input +script that reads a restart file. + +This pair style can only be used via the *pair* keyword of the +:doc:`run_style respa ` command. It does not support the +*inner*, *middle*, or *outer* keywords. + +---------- + +Restrictions +"""""""""""" + +The *pedone* pair style is only enabled if LAMMPS was built with the +EXTRA-PAIR package. See the :doc:`Build package ` page +for more info. + +Related commands +"""""""""""""""" + +:doc:`pair_coeff `, :doc:`pair_style `, +:doc:`pair style coul/long and coul/dsf `, +:doc:`pair style morse ` + +Default +""""""" + +none + +------------- + +.. _Pedone: + +**(Pedone)** A. Pedone, G. Malavasi, M. C. Menziani, A. N. Cormack, and U. Segre, J. Phys. Chem. B, 110, 11780 (2006) diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst index 53bf269e1c..74dfce6b01 100644 --- a/doc/src/pair_style.rst +++ b/doc/src/pair_style.rst @@ -275,30 +275,30 @@ accelerated styles exist. * :doc:`lj/smooth/linear ` - linear smoothed LJ potential * :doc:`lj/switch3/coulgauss/long ` - smoothed LJ vdW potential with Gaussian electrostatics * :doc:`lj96/cut ` - Lennard-Jones 9/6 potential -* :doc:`local/density ` - generalized basic local density potential -* :doc:`lubricate ` - hydrodynamic lubrication forces -* :doc:`lubricate/poly ` - hydrodynamic lubrication forces with polydispersity -* :doc:`lubricateU ` - hydrodynamic lubrication forces for Fast Lubrication Dynamics -* :doc:`lubricateU/poly ` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity +* :doc:`local/density ` - Generalized basic local density potential +* :doc:`lubricate ` - Hydrodynamic lubrication forces +* :doc:`lubricate/poly ` - Hydrodynamic lubrication forces with polydispersity +* :doc:`lubricateU ` - Hydrodynamic lubrication forces for Fast Lubrication Dynamics +* :doc:`lubricateU/poly ` - Hydrodynamic lubrication forces for Fast Lubrication with polydispersity * :doc:`mdpd ` - mDPD particle interactions * :doc:`mdpd/rhosum ` - mDPD particle interactions for mass density -* :doc:`meam ` - modified embedded atom method (MEAM) -* :doc:`meam/ms ` - multi-state modified embedded atom method (MS-MEAM) -* :doc:`meam/spline ` - splined version of MEAM -* :doc:`meam/sw/spline ` - splined version of MEAM with a Stillinger-Weber term -* :doc:`mesocnt ` - mesoscopic vdW potential for (carbon) nanotubes -* :doc:`mesocnt/viscous ` - mesoscopic vdW potential for (carbon) nanotubes with friction -* :doc:`mgpt ` - simplified model generalized pseudopotential theory (MGPT) potential +* :doc:`meam ` - Modified embedded atom method (MEAM) +* :doc:`meam/ms ` - Multi-state modified embedded atom method (MS-MEAM) +* :doc:`meam/spline ` - Splined version of MEAM +* :doc:`meam/sw/spline ` - Splined version of MEAM with a Stillinger-Weber term +* :doc:`mesocnt ` - Mesoscopic vdW potential for (carbon) nanotubes +* :doc:`mesocnt/viscous ` - Mesoscopic vdW potential for (carbon) nanotubes with friction +* :doc:`mgpt ` - Simplified model generalized pseudopotential theory (MGPT) potential * :doc:`mie/cut ` - Mie potential -* :doc:`mm3/switch3/coulgauss/long ` - smoothed MM3 vdW potential with Gaussian electrostatics +* :doc:`mm3/switch3/coulgauss/long ` - Smoothed MM3 vdW potential with Gaussian electrostatics * :doc:`momb ` - Many-Body Metal-Organic (MOMB) force field * :doc:`morse ` - Morse potential -* :doc:`morse/smooth/linear ` - linear smoothed Morse potential +* :doc:`morse/smooth/linear ` - Linear smoothed Morse potential * :doc:`morse/soft ` - Morse potential with a soft core * :doc:`multi/lucy ` - DPD potential with density-dependent force * :doc:`multi/lucy/rx ` - reactive DPD potential with density-dependent force -* :doc:`nb3b/harmonic ` - non-bonded 3-body harmonic potential -* :doc:`nb3b/screened ` - non-bonded 3-body screened harmonic potential +* :doc:`nb3b/harmonic ` - Non-bonded 3-body harmonic potential +* :doc:`nb3b/screened ` - Non-bonded 3-body screened harmonic potential * :doc:`nm/cut ` - N-M potential * :doc:`nm/cut/coul/cut ` - N-M potential with cutoff Coulomb * :doc:`nm/cut/coul/long ` - N-M potential with long-range Coulomb @@ -322,21 +322,22 @@ accelerated styles exist. * :doc:`oxrna2/xstk ` - * :doc:`pace ` - Atomic Cluster Expansion (ACE) machine-learning potential * :doc:`pace/extrapolation ` - Atomic Cluster Expansion (ACE) machine-learning potential with extrapolation grades +* :doc:`pedone ` - Pedone (PMMCS) potential (non-Coulomb part) * :doc:`pod ` - Proper orthogonal decomposition (POD) machine-learning potential -* :doc:`peri/eps ` - peridynamic EPS potential -* :doc:`peri/lps ` - peridynamic LPS potential -* :doc:`peri/pmb ` - peridynamic PMB potential -* :doc:`peri/ves ` - peridynamic VES potential -* :doc:`polymorphic ` - polymorphic 3-body potential +* :doc:`peri/eps ` - Peridynamic EPS potential +* :doc:`peri/lps ` - Peridynamic LPS potential +* :doc:`peri/pmb ` - Peridynamic PMB potential +* :doc:`peri/ves ` - Peridynamic VES potential +* :doc:`polymorphic ` - Polymorphic 3-body potential * :doc:`python ` - * :doc:`quip ` - * :doc:`rann ` - * :doc:`reaxff ` - ReaxFF potential -* :doc:`rebo ` - second generation REBO potential of Brenner +* :doc:`rebo ` - Second generation REBO potential of Brenner * :doc:`rebomos ` - REBOMoS potential for MoS2 * :doc:`resquared ` - Everaers RE-Squared ellipsoidal potential -* :doc:`saip/metal ` - interlayer potential for hetero-junctions formed with hexagonal 2D materials and metal surfaces -* :doc:`sdpd/taitwater/isothermal ` - smoothed dissipative particle dynamics for water at isothermal conditions +* :doc:`saip/metal ` - Interlayer potential for hetero-junctions formed with hexagonal 2D materials and metal surfaces +* :doc:`sdpd/taitwater/isothermal ` - Smoothed dissipative particle dynamics for water at isothermal conditions * :doc:`smatb ` - Second Moment Approximation to the Tight Binding * :doc:`smatb/single ` - Second Moment Approximation to the Tight Binding for single-element systems * :doc:`smd/hertz ` - diff --git a/doc/src/variable.rst b/doc/src/variable.rst index 1cd96543f5..ba5e5efd39 100644 --- a/doc/src/variable.rst +++ b/doc/src/variable.rst @@ -279,9 +279,9 @@ This means the variable can then be evaluated as many times as desired and will return those values. There are two ways to cause the next set of per-atom values from the file to be read: use the :doc:`next ` command or the next() function in an atom-style -variable, as discussed below. Unlike most variable styles -atomfile-style variables are **deleted** during a :doc:`clear ` -command. +variable, as discussed below. Unlike most variable styles, which +remain defined, atomfile-style variables are **deleted** during a +:doc:`clear ` command. The rules for formatting the file are as follows. Each time a set of per-atom values is read, a non-blank line is searched for in the file. @@ -289,23 +289,37 @@ The file is read line by line but only up to 254 characters are used. The rest are ignored. A comment character "#" can be used anywhere on a line and all text following and the "#" character are ignored; text starting with the comment character is stripped. Blank lines -are skipped. The first "word" of a non-blank line, delimited by -white-space, is read as the count N of per-atom lines to immediately -follow. N can be the total number of atoms in the system, or only a -subset. The next N lines have the following format - -.. parsed-literal:: - - ID value - -where ID is an atom ID and value is the per-atom numeric value that -will be assigned to that atom. IDs can be listed in any order. +are skipped. The first non-blank line is expected to contain a single +integer number as the count *N* of per-atom lines to follow. *N* can +be the total number of atoms in the system or less, indicating that data +for a subset is read. The next N lines must consist of two numbers, +the atom-ID of the atom for which a value is set followed by a floating +point number with the value. The atom-IDs may be listed in any order. .. note:: - Every time a set of per-atom lines is read, the value for all - atoms is first set to 0.0. Thus values for atoms whose ID does not - appear in the set, will remain 0.0. + Every time a set of per-atom lines is read, the value of the atomfile + variable for **all** atoms is first initialized to 0.0. Thus values + for atoms whose ID do not appear in the set in the file will remain + at 0.0. + +Below is a small example for the atomfile variable file format: + + .. parsed-literal:: + + # first set + 4 + # atom-ID value + 3 1 + 4 -4 + 1 0.5 + 2 -0.5 + + # second set + 2 + + 2 1.0 + 4 -1.0 ---------- @@ -1174,12 +1188,17 @@ custom atom properties are the same; just replace the leading "i" with +--------+---------------+------------------------------------------+ | equal | i_name[I] | element of per-atom vector (I = atom ID) | ++--------+---------------+------------------------------------------+ | equal | i2_name[I][J] | element of per-atom array (I = atom ID) | +--------+---------------+------------------------------------------+ ++--------+---------------+------------------------------------------+ | vector | i_name[I] | element of per-atom vector (I = atom ID) | ++--------+---------------+------------------------------------------+ | vector | i2_name[I][J] | element of per-atom array (I = atom ID) | +--------+---------------+------------------------------------------+ ++--------+---------------+------------------------------------------+ | atom | i_name | per-atom vector | ++--------+---------------+------------------------------------------+ | atom | i2_name[I] | column of per-atom array | +--------+---------------+------------------------------------------+ @@ -1222,15 +1241,23 @@ table: +--------+------------+------------------------------------------+ | equal | c_ID | global scalar | ++--------+------------+------------------------------------------+ | equal | c_ID[I] | element of global vector | ++--------+------------+------------------------------------------+ | equal | c_ID[I][J] | element of global array | ++--------+------------+------------------------------------------+ | equal | C_ID[I] | element of per-atom vector (I = atom ID) | ++--------+------------+------------------------------------------+ | equal | C_ID[I][J] | element of per-atom array (I = atom ID) | +--------+------------+------------------------------------------+ ++--------+------------+------------------------------------------+ | vector | c_ID | global vector | ++--------+------------+------------------------------------------+ | vector | c_ID[I] | column of global array | +--------+------------+------------------------------------------+ ++--------+------------+------------------------------------------+ | atom | c_ID | per-atom vector | ++--------+------------+------------------------------------------+ | atom | c_ID[I] | column of per-atom array | +--------+------------+------------------------------------------+ @@ -1286,15 +1313,23 @@ and atom-style variables are listed in the following table: +--------+------------+------------------------------------------+ | equal | f_ID | global scalar | ++--------+------------+------------------------------------------+ | equal | f_ID[I] | element of global vector | ++--------+------------+------------------------------------------+ | equal | f_ID[I][J] | element of global array | ++--------+------------+------------------------------------------+ | equal | F_ID[I] | element of per-atom vector (I = atom ID) | ++--------+------------+------------------------------------------+ | equal | F_ID[I][J] | element of per-atom array (I = atom ID) | +--------+------------+------------------------------------------+ ++--------+------------+------------------------------------------+ | vector | f_ID | global vector | ++--------+------------+------------------------------------------+ | vector | f_ID[I] | column of global array | +--------+------------+------------------------------------------+ ++--------+------------+------------------------------------------+ | atom | f_ID | per-atom vector | ++--------+------------+------------------------------------------+ | atom | f_ID[I] | column of per-atom array | +--------+------------+------------------------------------------+ @@ -1365,17 +1400,27 @@ per-atom vector. +--------+-----------+-----------------------------------------------------------------------------------+ | equal | v_name | global scalar from an equal-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | equal | v_name[I] | element of global vector from a vector-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | equal | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable | +--------+-----------+-----------------------------------------------------------------------------------+ ++--------+-----------+-----------------------------------------------------------------------------------+ | vector | v_name | global scalar from an equal-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | vector | v_name | global vector from a vector-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | vector | v_name[I] | element of global vector from a vector-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | vector | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable | +--------+-----------+-----------------------------------------------------------------------------------+ ++--------+-----------+-----------------------------------------------------------------------------------+ | atom | v_name | global scalar from an equal-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | atom | v_name | per-atom vector from an atom-style or atomfile-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | atom | v_name[I] | element of global vector from a vector-style variable | ++--------+-----------+-----------------------------------------------------------------------------------+ | atom | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable | +--------+-----------+-----------------------------------------------------------------------------------+ diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 980fe581e8..836bb775f5 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2042,6 +2042,7 @@ Makefiles makelist makepkg Makse +Malavasi malloc Malolepsza Manby @@ -2151,6 +2152,7 @@ membered memcheck Mendelev Menon +Menziani mer Meremianin Mersenne @@ -2775,6 +2777,8 @@ Peachey peachpuff Pearlman Pedersen +pedone +Pedone peID PEigenDense Peng diff --git a/examples/PACKAGES/pedone/in.pedone.melt b/examples/PACKAGES/pedone/in.pedone.melt new file mode 100644 index 0000000000..a2b1c3a71e --- /dev/null +++ b/examples/PACKAGES/pedone/in.pedone.melt @@ -0,0 +1,38 @@ +# Ca-O melt with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations + +region box block 0 4 0 4 0 4 +create_box 2 box +create_atoms 1 box + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +create_atoms 2 box + +mass 1 40.078 +mass 2 15.999 + +set type 1 charge 1.2 +set type 2 charge -1.2 + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +velocity all create 6000.0 98347 + +fix 1 all nvt temp 3000.0 3000.0 0.1 + +# dump 1 all atom 500 Ca-O-melt.lammpstrj + +thermo 100 +run 1000 diff --git a/examples/PACKAGES/pedone/in.pedone.relax b/examples/PACKAGES/pedone/in.pedone.relax new file mode 100644 index 0000000000..38ccd651e7 --- /dev/null +++ b/examples/PACKAGES/pedone/in.pedone.relax @@ -0,0 +1,38 @@ +# Ca-O crystal with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations + +region box block 0 4 0 4 0 4 +create_box 2 box +create_atoms 1 box + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +create_atoms 2 box + +mass 1 40.078 +mass 2 15.999 + +displace_atoms all random 0.01 0.01 0.01 9084544 +set type 1 charge 1.2 +set type 2 charge -1.2 + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +variable len equal lx*0.25 +thermo_style custom step v_len lx pe press +thermo 100 +fix 1 all box/relax iso 0.0 +minimize 0.0 0.0 1000 10000 + +print "Expected lattice parameter: 4.7748, computed: $(v_len:%6.4f)" diff --git a/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.1 b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.1 new file mode 100644 index 0000000000..dc33289391 --- /dev/null +++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.1 @@ -0,0 +1,122 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd) + using 1 OpenMP thread(s) per MPI task +# Ca-O melt with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 + +region box block 0 4 0 4 0 4 +create_box 2 box +Created orthogonal box = (0 0 0) to (19.242 19.242 19.242) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 +create_atoms 2 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +mass 1 40.078 +mass 2 15.999 + +set type 1 charge 1.2 +Setting atom values ... + 256 settings made for charge +set type 2 charge -1.2 +Setting atom values ... + 256 settings made for charge + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +velocity all create 6000.0 98347 + +fix 1 all nvt temp 3000.0 3000.0 0.1 + +# dump 1 all atom 500 Ca-O-melt.lammpstrj + +thermo 100 +run 1000 +PPPM initialization ... + using 12-bit tables for long-range coulomb (src/kspace.cpp:342) + G vector (1/distance) = 0.23676226 + grid = 24 24 24 + stencil order = 5 + estimated absolute RMS force accuracy = 1.3089053e-05 + estimated relative force accuracy = 9.089844e-07 + using double precision FFTW3 + 3d grid and FFT values/proc = 29791 13824 +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 3 3 3 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair pedone, perpetual, skip from (2) + attributes: half, newton on + pair build: skip + stencil: none + bin: none + (2) pair coul/long, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 9.239 | 9.239 | 9.239 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 6000 -3771.5568 0 -3375.2452 34213.185 + 100 2894.1756 -3562.491 0 -3371.3251 114640.32 + 200 2980.3531 -3570.2657 0 -3373.4076 123673.56 + 300 2783.0437 -3574.5809 0 -3390.7554 119791.27 + 400 3021.6581 -3568.2149 0 -3368.6285 116032.29 + 500 3112.0438 -3580.0178 0 -3374.4613 114798.18 + 600 2973.4609 -3577.0582 0 -3380.6553 111843.46 + 700 3180.1687 -3568.4542 0 -3358.3979 121008.83 + 800 2923.7803 -3573.3023 0 -3380.181 111459.55 + 900 2940.3133 -3572.1322 0 -3377.9188 118177.36 + 1000 3070.2584 -3575.5655 0 -3372.769 114175.52 +Loop time of 13.683 on 1 procs for 1000 steps with 512 atoms + +Performance: 12.629 ns/day, 1.900 hours/ns, 73.084 timesteps/s, 37.419 katom-step/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 11.545 | 11.545 | 11.545 | 0.0 | 84.37 +Kspace | 1.4121 | 1.4121 | 1.4121 | 0.0 | 10.32 +Neigh | 0.65265 | 0.65265 | 0.65265 | 0.0 | 4.77 +Comm | 0.056036 | 0.056036 | 0.056036 | 0.0 | 0.41 +Output | 0.00022945 | 0.00022945 | 0.00022945 | 0.0 | 0.00 +Modify | 0.0090252 | 0.0090252 | 0.0090252 | 0.0 | 0.07 +Other | | 0.00801 | | | 0.06 + +Nlocal: 512 ave 512 max 512 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10901 ave 10901 max 10901 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 374419 ave 374419 max 374419 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 374419 +Ave neighs/atom = 731.28711 +Neighbor list builds = 71 +Dangerous builds = 0 +Total wall time: 0:00:13 diff --git a/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.4 b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.4 new file mode 100644 index 0000000000..693b8b3050 --- /dev/null +++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.4 @@ -0,0 +1,122 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd) + using 1 OpenMP thread(s) per MPI task +# Ca-O melt with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 + +region box block 0 4 0 4 0 4 +create_box 2 box +Created orthogonal box = (0 0 0) to (19.242 19.242 19.242) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 +create_atoms 2 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +mass 1 40.078 +mass 2 15.999 + +set type 1 charge 1.2 +Setting atom values ... + 256 settings made for charge +set type 2 charge -1.2 +Setting atom values ... + 256 settings made for charge + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +velocity all create 6000.0 98347 + +fix 1 all nvt temp 3000.0 3000.0 0.1 + +# dump 1 all atom 500 Ca-O-melt.lammpstrj + +thermo 100 +run 1000 +PPPM initialization ... + using 12-bit tables for long-range coulomb (src/kspace.cpp:342) + G vector (1/distance) = 0.23676226 + grid = 24 24 24 + stencil order = 5 + estimated absolute RMS force accuracy = 1.3089053e-05 + estimated relative force accuracy = 9.089844e-07 + using double precision FFTW3 + 3d grid and FFT values/proc = 11191 3456 +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 3 3 3 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair pedone, perpetual, skip from (2) + attributes: half, newton on + pair build: skip + stencil: none + bin: none + (2) pair coul/long, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.315 | 5.315 | 5.315 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 6000 -3771.5568 0 -3375.2452 34213.185 + 100 3050.0106 -3571.4712 0 -3370.0121 118480.04 + 200 3100.0073 -3571.2534 0 -3366.492 120618.37 + 300 2959.7127 -3580.0883 0 -3384.5935 109184.72 + 400 2922.7083 -3563.9803 0 -3370.9298 120165.71 + 500 3145.0439 -3571.3828 0 -3363.6465 115057.51 + 600 2741.7439 -3563.5077 0 -3382.4102 115504.31 + 700 2906.3636 -3567.3604 0 -3375.3895 119518.5 + 800 2995.3864 -3567.3838 0 -3369.5327 117975.22 + 900 2965.24 -3565.7983 0 -3369.9385 123362.35 + 1000 2916.6485 -3578.7471 0 -3386.0968 115624.78 +Loop time of 4.50395 on 4 procs for 1000 steps with 512 atoms + +Performance: 38.366 ns/day, 0.626 hours/ns, 222.028 timesteps/s, 113.678 katom-step/s +99.4% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 3.2703 | 3.2983 | 3.3259 | 1.3 | 73.23 +Kspace | 0.79815 | 0.82633 | 0.85342 | 2.6 | 18.35 +Neigh | 0.18328 | 0.18398 | 0.18472 | 0.1 | 4.08 +Comm | 0.17423 | 0.17508 | 0.17592 | 0.2 | 3.89 +Output | 0.00019336 | 0.0002167 | 0.00028554 | 0.0 | 0.00 +Modify | 0.0089842 | 0.0091093 | 0.0092205 | 0.1 | 0.20 +Other | | 0.01096 | | | 0.24 + +Nlocal: 128 ave 143 max 118 min +Histogram: 2 0 0 0 0 1 0 0 0 1 +Nghost: 7622.75 ave 7651 max 7598 min +Histogram: 1 0 0 1 1 0 0 0 0 1 +Neighs: 93581.8 ave 106456 max 84898 min +Histogram: 1 1 0 0 1 0 0 0 0 1 + +Total # of neighbors = 374327 +Ave neighs/atom = 731.10742 +Neighbor list builds = 71 +Dangerous builds = 0 +Total wall time: 0:00:04 diff --git a/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.1 b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.1 new file mode 100644 index 0000000000..bf58a8da9b --- /dev/null +++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.1 @@ -0,0 +1,134 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd) + using 1 OpenMP thread(s) per MPI task +# Ca-O crystal with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 + +region box block 0 4 0 4 0 4 +create_box 2 box +Created orthogonal box = (0 0 0) to (19.242 19.242 19.242) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 +create_atoms 2 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +mass 1 40.078 +mass 2 15.999 + +displace_atoms all random 0.01 0.01 0.01 9084544 +Displacing atoms ... +set type 1 charge 1.2 +Setting atom values ... + 256 settings made for charge +set type 2 charge -1.2 +Setting atom values ... + 256 settings made for charge + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +variable len equal lx*0.25 +thermo_style custom step v_len lx pe press +thermo 100 +fix 1 all box/relax iso 0.0 +minimize 0.0 0.0 1000 10000 +Switching to 'neigh_modify every 1 delay 0 check yes' setting during minimization +PPPM initialization ... + using 12-bit tables for long-range coulomb (src/kspace.cpp:342) + G vector (1/distance) = 0.23676226 + grid = 24 24 24 + stencil order = 5 + estimated absolute RMS force accuracy = 1.3089053e-05 + estimated relative force accuracy = 9.089844e-07 + using double precision FFTW3 + 3d grid and FFT values/proc = 29791 13824 +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 3 3 3 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair pedone, perpetual, skip from (2) + attributes: half, newton on + pair build: skip + stencil: none + bin: none + (2) pair coul/long, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +WARNING: Energy due to 1 extra global DOFs will be included in minimizer energies + (src/min.cpp:219) +Per MPI rank memory allocation (min/avg/max) = 10.33 | 10.33 | 10.33 Mbytes + Step v_len Lx PotEng Press + 0 4.8105 19.242 -3765.9116 -21299.914 + 100 4.7797128 19.118851 -3767.814 -164.13101 + 200 4.7787507 19.115003 -3769.1366 -373.58797 + 300 4.7768265 19.107306 -3770.5634 48.944709 + 400 4.7768265 19.107306 -3770.9879 -258.56116 + 500 4.7758644 19.103458 -3771.3898 173.91894 + 600 4.7758644 19.103458 -3771.7586 -91.813678 + 700 4.7758644 19.103458 -3771.9842 -252.52883 + 800 4.7749023 19.099609 -3772.3526 216.83318 + 857 4.7747927 19.099171 -3772.8223 32.586251 +Loop time of 18.0592 on 1 procs for 857 steps with 512 atoms + +99.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = linesearch alpha is zero + Energy initial, next-to-last, final = + -3765.91161156884 -3772.82226663623 -3772.82226663623 + Force two-norm initial, final = 284.3967 0.46963871 + Force max component initial, final = 284.14458 0.42827677 + Final line search alpha, max atom move = 2.8580337e-08 1.2240294e-08 + Iterations, force evaluations = 857 894 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 13.907 | 13.907 | 13.907 | 0.0 | 77.01 +Kspace | 1.3809 | 1.3809 | 1.3809 | 0.0 | 7.65 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.045871 | 0.045871 | 0.045871 | 0.0 | 0.25 +Output | 0.0002809 | 0.0002809 | 0.0002809 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 2.726 | | | 15.09 + +Nlocal: 512 ave 512 max 512 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 11655 ave 11655 max 11655 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 372155 ave 372155 max 372155 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 372155 +Ave neighs/atom = 726.86523 +Neighbor list builds = 0 +Dangerous builds = 0 + +print "Expected lattice parameter: 4.7748, computed: $(v_len:%6.4f)" +Expected lattice parameter: 4.7748, computed: 4.7748 +Total wall time: 0:00:18 diff --git a/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.4 b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.4 new file mode 100644 index 0000000000..95a84f81e0 --- /dev/null +++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.4 @@ -0,0 +1,134 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd) + using 1 OpenMP thread(s) per MPI task +# Ca-O crystal with Pedone potential + +units metal +atom_style charge + +lattice fcc 4.8105 # experimental lattice parameter for fcc-lattice Ca cations +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 + +region box block 0 4 0 4 0 4 +create_box 2 box +Created orthogonal box = (0 0 0) to (19.242 19.242 19.242) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell +Lattice spacing in x,y,z = 4.8105 4.8105 4.8105 +create_atoms 2 box +Created 256 atoms + using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242) + create_atoms CPU = 0.000 seconds + +mass 1 40.078 +mass 2 15.999 + +displace_atoms all random 0.01 0.01 0.01 9084544 +Displacing atoms ... +set type 1 charge 1.2 +Setting atom values ... + 256 settings made for charge +set type 2 charge -1.2 +Setting atom values ... + 256 settings made for charge + +timestep 0.002 +neigh_modify delay 5 every 1 check yes + +pair_style hybrid/overlay pedone 15.0 coul/long 15.0 +kspace_style pppm 1.0e-6 + +pair_coeff * * coul/long +pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0 +pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0 + +variable len equal lx*0.25 +thermo_style custom step v_len lx pe press +thermo 100 +fix 1 all box/relax iso 0.0 +minimize 0.0 0.0 1000 10000 +Switching to 'neigh_modify every 1 delay 0 check yes' setting during minimization +PPPM initialization ... + using 12-bit tables for long-range coulomb (src/kspace.cpp:342) + G vector (1/distance) = 0.23676226 + grid = 24 24 24 + stencil order = 5 + estimated absolute RMS force accuracy = 1.3089053e-05 + estimated relative force accuracy = 9.089844e-07 + using double precision FFTW3 + 3d grid and FFT values/proc = 11191 3456 +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 3 3 3 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair pedone, perpetual, skip from (2) + attributes: half, newton on + pair build: skip + stencil: none + bin: none + (2) pair coul/long, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +WARNING: Energy due to 1 extra global DOFs will be included in minimizer energies + (src/min.cpp:219) +Per MPI rank memory allocation (min/avg/max) = 6.44 | 6.44 | 6.44 Mbytes + Step v_len Lx PotEng Press + 0 4.8105 19.242 -3765.9116 -21299.914 + 100 4.7797128 19.118851 -3767.814 -164.13101 + 200 4.7787507 19.115003 -3769.1367 -373.59489 + 300 4.7768265 19.107306 -3770.5868 32.046893 + 400 4.7768265 19.107306 -3771.0322 -290.69703 + 500 4.7758644 19.103458 -3771.4223 150.34606 + 600 4.7758644 19.103458 -3771.7941 -117.26938 + 700 4.7758644 19.103458 -3772.0193 -277.34372 + 800 4.7749023 19.099609 -3772.42 171.95177 + 860 4.7748339 19.099336 -3772.8237 1.0976356 +Loop time of 5.65601 on 4 procs for 860 steps with 512 atoms + +99.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = linesearch alpha is zero + Energy initial, next-to-last, final = + -3765.91161156888 -3772.82365446552 -3772.82365446552 + Force two-norm initial, final = 284.3967 0.067746634 + Force max component initial, final = 284.14458 0.014426328 + Final line search alpha, max atom move = 1.9073486e-06 2.7516038e-08 + Iterations, force evaluations = 860 922 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 3.7408 | 3.8442 | 4.0543 | 6.5 | 67.97 +Kspace | 0.60187 | 0.81211 | 0.91543 | 14.1 | 14.36 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.14969 | 0.15017 | 0.15071 | 0.1 | 2.66 +Output | 0.00019203 | 0.00020711 | 0.0002511 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.8494 | | | 15.02 + +Nlocal: 128 ave 135 max 123 min +Histogram: 1 0 1 0 1 0 0 0 0 1 +Nghost: 8175 ave 8180 max 8168 min +Histogram: 1 0 0 0 0 1 0 1 0 1 +Neighs: 93038.8 ave 98164 max 89373 min +Histogram: 1 0 1 0 1 0 0 0 0 1 + +Total # of neighbors = 372155 +Ave neighs/atom = 726.86523 +Neighbor list builds = 0 +Dangerous builds = 0 + +print "Expected lattice parameter: 4.7748, computed: $(v_len:%6.4f)" +Expected lattice parameter: 4.7748, computed: 4.7748 +Total wall time: 0:00:05 diff --git a/examples/micelle/log.29Mar2019.micelle-rigid.g++.1 b/examples/micelle/log.29Mar2019.micelle-rigid.g++.1 deleted file mode 100644 index f1001e6cea..0000000000 --- a/examples/micelle/log.29Mar2019.micelle-rigid.g++.1 +++ /dev/null @@ -1,260 +0,0 @@ -LAMMPS (29 Mar 2019) - using 1 OpenMP thread(s) per MPI task -# 2d micelle simulation - -dimension 2 - -neighbor 0.3 bin -neigh_modify delay 5 - -atom_style bond - -# Soft potential push-off - -read_data data.micelle - orthogonal box = (0 0 -0.1) to (35.8569 35.8569 0.1) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 1200 atoms - scanning bonds ... - 1 = max bonds/atom - reading bonds ... - 300 bonds - 2 = max # of 1-2 neighbors - 1 = max # of 1-3 neighbors - 1 = max # of 1-4 neighbors - 2 = max # of special neighbors - special bonds CPU = 0.000473022 secs - read_data CPU = 0.0024147 secs -special_bonds fene - 2 = max # of 1-2 neighbors - 2 = max # of special neighbors - special bonds CPU = 0.00022316 secs - -pair_style soft 1.12246 -pair_coeff * * 0.0 1.12246 - -bond_style harmonic -bond_coeff 1 50.0 0.75 - -velocity all create 0.45 2349852 - -variable prefactor equal ramp(1.0,20.0) - -fix 1 all nve -fix 2 all temp/rescale 100 0.45 0.45 0.02 1.0 -fix 3 all adapt 1 pair soft a * * v_prefactor -fix 4 all enforce2d - -thermo 50 -run 500 -Neighbor list info ... - update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 1.42246 - ghost atom cutoff = 1.42246 - binsize = 0.71123, bins = 51 51 1 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair soft, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/2d/newton - bin: standard -Per MPI rank memory allocation (min/avg/max) = 3.799 | 3.799 | 3.799 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0.45 0.40003481 2.2200223e-06 0.84966203 0.78952518 - 50 0.54981866 0.93548899 0.068440043 1.5532895 1.9232786 - 100 0.45 0.99659327 0.079228519 1.5254468 3.2135679 - 150 0.86965411 0.90456016 0.07493355 1.8484231 4.3821925 - 200 0.45 1.01454 0.10663502 1.5708 4.7598476 - 250 0.79636561 0.82567712 0.12105337 1.7424325 5.4983899 - 300 0.45 0.86475538 0.11819875 1.4325791 5.8554758 - 350 0.72135464 0.70693069 0.10912636 1.5368106 6.0388247 - 400 0.45 0.75067331 0.14165013 1.3419484 6.3840708 - 450 0.64839221 0.62402486 0.14173679 1.4136135 6.4791009 - 500 0.45 0.66669513 0.13695201 1.2532721 6.807146 -Loop time of 0.103162 on 1 procs for 500 steps with 1200 atoms - -Performance: 2093802.885 tau/day, 4846.766 timesteps/s -99.6% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.068308 | 0.068308 | 0.068308 | 0.0 | 66.21 -Bond | 0.004235 | 0.004235 | 0.004235 | 0.0 | 4.11 -Neigh | 0.014069 | 0.014069 | 0.014069 | 0.0 | 13.64 -Comm | 0.0019219 | 0.0019219 | 0.0019219 | 0.0 | 1.86 -Output | 0.00017262 | 0.00017262 | 0.00017262 | 0.0 | 0.17 -Modify | 0.011728 | 0.011728 | 0.011728 | 0.0 | 11.37 -Other | | 0.002726 | | | 2.64 - -Nlocal: 1200 ave 1200 max 1200 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 197 ave 197 max 197 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 3094 ave 3094 max 3094 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 3094 -Ave neighs/atom = 2.57833 -Ave special neighs/atom = 0.5 -Neighbor list builds = 52 -Dangerous builds = 0 - -unfix 3 - -# Main run - -pair_style lj/cut 2.5 - -# solvent/head - full-size and long-range - -pair_coeff 1 1 1.0 1.0 2.5 -pair_coeff 2 2 1.0 1.0 2.5 -pair_coeff 1 2 1.0 1.0 2.5 - -# tail/tail - size-averaged and long-range - -pair_coeff 3 3 1.0 0.75 2.5 -pair_coeff 4 4 1.0 0.50 2.5 -pair_coeff 3 4 1.0 0.67 2.5 - -# solvent/tail - full-size and repulsive - -pair_coeff 1 3 1.0 1.0 1.12246 -pair_coeff 1 4 1.0 1.0 1.12246 - -# head/tail - size-averaged and repulsive - -pair_coeff 2 3 1.0 0.88 1.12246 -pair_coeff 2 4 1.0 0.75 1.12246 - -thermo 50 - -#dump 1 all atom 2000 dump.micelle - -#dump 2 all image 2000 image.*.jpg type type zoom 1.6 -#dump_modify 2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 - -#dump 3 all movie 2000 movie.mpg type type zoom 1.6 -#dump_modify 3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 - -reset_timestep 0 -group solvent molecule 0 -750 atoms in group solvent -group solute subtract all solvent -450 atoms in group solute -unfix 1 -unfix 2 -unfix 4 -fix 1 solvent nve -fix 2 solvent temp/rescale 100 0.45 0.45 0.02 1.0 -fix 5 solute rigid molecule langevin 0.45 0.45 0.5 112211 -150 rigid bodies with 450 atoms -fix 4 all enforce2d -run 500 -Neighbor list info ... - update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 2.8 - ghost atom cutoff = 2.8 - binsize = 1.4, bins = 26 26 1 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/cut, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/2d/newton - bin: standard -Per MPI rank memory allocation (min/avg/max) = 5.274 | 5.274 | 5.274 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0.45318168 -1.3753652 0.13695201 -0.8705807 1.975423 - 50 0.77871641 -1.6955252 0.13695201 -0.92651507 0.64222539 - 100 0.5336062 -1.7124572 0.13695201 -1.1423948 -0.11959696 - 150 0.58789067 -1.7926109 0.13695201 -1.1784877 1.2592743 - 200 0.47864796 -1.8040298 0.13695201 -1.2785752 3.6739793 - 250 0.51124651 -1.8614797 0.13695201 -1.309566 2.5817722 - 300 0.45695639 -1.8708384 0.13695201 -1.3629901 3.0833794 - 350 0.477504 -1.8924359 0.13695201 -1.3679098 -5.1605926 - 400 0.45328205 -1.87754 0.13695201 -1.372674 -4.0355858 - 450 0.47465031 -1.9071924 0.13695201 -1.3849826 3.1949617 - 500 0.45533691 -1.9072316 0.13695201 -1.4006978 0.48079061 -Loop time of 0.178806 on 1 procs for 500 steps with 1200 atoms - -Performance: 1208012.705 tau/day, 2796.326 timesteps/s -99.6% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.086131 | 0.086131 | 0.086131 | 0.0 | 48.17 -Bond | 0.0042472 | 0.0042472 | 0.0042472 | 0.0 | 2.38 -Neigh | 0.021317 | 0.021317 | 0.021317 | 0.0 | 11.92 -Comm | 0.0025985 | 0.0025985 | 0.0025985 | 0.0 | 1.45 -Output | 0.000175 | 0.000175 | 0.000175 | 0.0 | 0.10 -Modify | 0.061408 | 0.061408 | 0.061408 | 0.0 | 34.34 -Other | | 0.00293 | | | 1.64 - -Nlocal: 1200 ave 1200 max 1200 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 416 ave 416 max 416 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 8769 ave 8769 max 8769 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 8769 -Ave neighs/atom = 7.3075 -Ave special neighs/atom = 0.5 -Neighbor list builds = 47 -Dangerous builds = 2 -unfix 2 -unfix 4 -unfix 5 -fix 5 solute rigid/small molecule - create bodies CPU = 0.00015378 secs -150 rigid bodies with 450 atoms - 1.30435 = max distance from body owner to body atom -fix 4 all enforce2d -run 500 -Per MPI rank memory allocation (min/avg/max) = 8.64 | 8.64 | 8.64 Mbytes -Step Temp E_pair E_mol TotEng Press - 500 0.45533691 -1.9072316 0.13695201 -1.4006978 2.4545793 - 550 0.45627282 -1.912409 0.13695201 -1.4051155 2.1845065 - 600 0.44734553 -1.8890695 0.13695201 -1.389022 2.3458965 - 650 0.46444648 -1.9042462 0.13695201 -1.3903185 2.1609319 - 700 0.47113236 -1.8977576 0.13695201 -1.3784032 2.2420351 - 750 0.48554548 -1.9253545 0.13695201 -1.3943015 2.143907 - 800 0.46350091 -1.8865749 0.13695201 -1.3734146 2.294431 - 850 0.4766104 -1.9094039 0.13695201 -1.3856031 2.2077157 - 900 0.48988467 -1.9051538 0.13695201 -1.3705787 2.0107056 - 950 0.48351943 -1.9162485 0.13695201 -1.3868399 2.1891332 - 1000 0.49033701 -1.9115165 0.13695201 -1.3765742 2.1508141 -Loop time of 0.166502 on 1 procs for 500 steps with 1200 atoms - -Performance: 1297278.008 tau/day, 3002.958 timesteps/s -99.6% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.085767 | 0.085767 | 0.085767 | 0.0 | 51.51 -Bond | 0.0042562 | 0.0042562 | 0.0042562 | 0.0 | 2.56 -Neigh | 0.018039 | 0.018039 | 0.018039 | 0.0 | 10.83 -Comm | 0.0024002 | 0.0024002 | 0.0024002 | 0.0 | 1.44 -Output | 0.00018239 | 0.00018239 | 0.00018239 | 0.0 | 0.11 -Modify | 0.052717 | 0.052717 | 0.052717 | 0.0 | 31.66 -Other | | 0.003141 | | | 1.89 - -Nlocal: 1200 ave 1200 max 1200 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 415 ave 415 max 415 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 8743 ave 8743 max 8743 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 8743 -Ave neighs/atom = 7.28583 -Ave special neighs/atom = 0.5 -Neighbor list builds = 40 -Dangerous builds = 0 -Total wall time: 0:00:00 diff --git a/examples/micelle/log.29Mar2019.micelle-rigid.g++.4 b/examples/micelle/log.29Mar2019.micelle-rigid.g++.4 deleted file mode 100644 index e65f67a527..0000000000 --- a/examples/micelle/log.29Mar2019.micelle-rigid.g++.4 +++ /dev/null @@ -1,260 +0,0 @@ -LAMMPS (29 Mar 2019) - using 1 OpenMP thread(s) per MPI task -# 2d micelle simulation - -dimension 2 - -neighbor 0.3 bin -neigh_modify delay 5 - -atom_style bond - -# Soft potential push-off - -read_data data.micelle - orthogonal box = (0 0 -0.1) to (35.8569 35.8569 0.1) - 2 by 2 by 1 MPI processor grid - reading atoms ... - 1200 atoms - scanning bonds ... - 1 = max bonds/atom - reading bonds ... - 300 bonds - 2 = max # of 1-2 neighbors - 1 = max # of 1-3 neighbors - 1 = max # of 1-4 neighbors - 2 = max # of special neighbors - special bonds CPU = 0.000422001 secs - read_data CPU = 0.00473404 secs -special_bonds fene - 2 = max # of 1-2 neighbors - 2 = max # of special neighbors - special bonds CPU = 0.000183344 secs - -pair_style soft 1.12246 -pair_coeff * * 0.0 1.12246 - -bond_style harmonic -bond_coeff 1 50.0 0.75 - -velocity all create 0.45 2349852 - -variable prefactor equal ramp(1.0,20.0) - -fix 1 all nve -fix 2 all temp/rescale 100 0.45 0.45 0.02 1.0 -fix 3 all adapt 1 pair soft a * * v_prefactor -fix 4 all enforce2d - -thermo 50 -run 500 -Neighbor list info ... - update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 1.42246 - ghost atom cutoff = 1.42246 - binsize = 0.71123, bins = 51 51 1 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair soft, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/2d/newton - bin: standard -Per MPI rank memory allocation (min/avg/max) = 3.758 | 3.85 | 4.126 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0.45 0.40003481 2.2200223e-06 0.84966203 0.78952518 - 50 0.54981866 0.93548899 0.068440043 1.5532895 1.9232786 - 100 0.45 0.99659327 0.079228519 1.5254468 3.2135679 - 150 0.86965411 0.90456016 0.07493355 1.8484231 4.3821925 - 200 0.45 1.01454 0.10663502 1.5708 4.7598476 - 250 0.79636561 0.82567712 0.12105337 1.7424325 5.4983899 - 300 0.45 0.86475538 0.11819875 1.4325791 5.8554758 - 350 0.72135464 0.70693069 0.10912636 1.5368106 6.0388247 - 400 0.45 0.75067331 0.14165013 1.3419484 6.3840708 - 450 0.64839221 0.62402486 0.14173679 1.4136135 6.4791009 - 500 0.45 0.66669513 0.13695201 1.2532721 6.807146 -Loop time of 0.0426326 on 4 procs for 500 steps with 1200 atoms - -Performance: 5066547.720 tau/day, 11728.120 timesteps/s -98.7% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.016784 | 0.019254 | 0.022154 | 1.5 | 45.16 -Bond | 0.0010612 | 0.0012558 | 0.0014153 | 0.4 | 2.95 -Neigh | 0.0046048 | 0.0046697 | 0.0047245 | 0.1 | 10.95 -Comm | 0.0064592 | 0.0097114 | 0.012527 | 2.4 | 22.78 -Output | 0.00022507 | 0.00026393 | 0.00033951 | 0.0 | 0.62 -Modify | 0.0041659 | 0.0048084 | 0.0053945 | 0.8 | 11.28 -Other | | 0.002669 | | | 6.26 - -Nlocal: 300 ave 304 max 292 min -Histogram: 1 0 0 0 0 0 0 0 2 1 -Nghost: 103.5 ave 108 max 98 min -Histogram: 1 0 0 1 0 0 0 0 0 2 -Neighs: 773.5 ave 792 max 735 min -Histogram: 1 0 0 0 0 0 0 0 2 1 - -Total # of neighbors = 3094 -Ave neighs/atom = 2.57833 -Ave special neighs/atom = 0.5 -Neighbor list builds = 52 -Dangerous builds = 0 - -unfix 3 - -# Main run - -pair_style lj/cut 2.5 - -# solvent/head - full-size and long-range - -pair_coeff 1 1 1.0 1.0 2.5 -pair_coeff 2 2 1.0 1.0 2.5 -pair_coeff 1 2 1.0 1.0 2.5 - -# tail/tail - size-averaged and long-range - -pair_coeff 3 3 1.0 0.75 2.5 -pair_coeff 4 4 1.0 0.50 2.5 -pair_coeff 3 4 1.0 0.67 2.5 - -# solvent/tail - full-size and repulsive - -pair_coeff 1 3 1.0 1.0 1.12246 -pair_coeff 1 4 1.0 1.0 1.12246 - -# head/tail - size-averaged and repulsive - -pair_coeff 2 3 1.0 0.88 1.12246 -pair_coeff 2 4 1.0 0.75 1.12246 - -thermo 50 - -#dump 1 all atom 2000 dump.micelle - -#dump 2 all image 2000 image.*.jpg type type zoom 1.6 -#dump_modify 2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 - -#dump 3 all movie 2000 movie.mpg type type zoom 1.6 -#dump_modify 3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 - -reset_timestep 0 -group solvent molecule 0 -750 atoms in group solvent -group solute subtract all solvent -450 atoms in group solute -unfix 1 -unfix 2 -unfix 4 -fix 1 solvent nve -fix 2 solvent temp/rescale 100 0.45 0.45 0.02 1.0 -fix 5 solute rigid molecule langevin 0.45 0.45 0.5 112211 -150 rigid bodies with 450 atoms -fix 4 all enforce2d -run 500 -Neighbor list info ... - update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 2.8 - ghost atom cutoff = 2.8 - binsize = 1.4, bins = 26 26 1 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/cut, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/2d/newton - bin: standard -Per MPI rank memory allocation (min/avg/max) = 5.251 | 5.282 | 5.374 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0.45318168 -1.3753652 0.13695201 -0.8705807 1.975423 - 50 0.77871641 -1.6955252 0.13695201 -0.92651507 0.64222539 - 100 0.5336062 -1.7124572 0.13695201 -1.1423948 -0.11959696 - 150 0.58789067 -1.7926109 0.13695201 -1.1784877 1.2592743 - 200 0.47864796 -1.8040298 0.13695201 -1.2785752 3.6739793 - 250 0.51124651 -1.8614797 0.13695201 -1.309566 2.5817722 - 300 0.45695639 -1.8708384 0.13695201 -1.3629901 3.0833794 - 350 0.477504 -1.8924359 0.13695201 -1.3679098 -5.1605926 - 400 0.45328205 -1.87754 0.13695201 -1.372674 -4.0355858 - 450 0.47465031 -1.9071924 0.13695201 -1.3849826 3.1949617 - 500 0.45533691 -1.9072316 0.13695201 -1.4006978 0.48079061 -Loop time of 0.0887392 on 4 procs for 500 steps with 1200 atoms - -Performance: 2434100.210 tau/day, 5634.491 timesteps/s -98.9% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.022611 | 0.022839 | 0.023082 | 0.1 | 25.74 -Bond | 0.0010793 | 0.0011569 | 0.0012515 | 0.2 | 1.30 -Neigh | 0.0064609 | 0.0064996 | 0.0065265 | 0.0 | 7.32 -Comm | 0.0071712 | 0.0073687 | 0.0077734 | 0.3 | 8.30 -Output | 0.00023389 | 0.00025356 | 0.00030327 | 0.0 | 0.29 -Modify | 0.047258 | 0.047683 | 0.048503 | 0.2 | 53.73 -Other | | 0.002938 | | | 3.31 - -Nlocal: 300 ave 309 max 291 min -Histogram: 1 0 0 1 0 0 1 0 0 1 -Nghost: 218.75 ave 223 max 216 min -Histogram: 1 0 2 0 0 0 0 0 0 1 -Neighs: 2192.25 ave 2251 max 2113 min -Histogram: 1 0 0 1 0 0 0 0 0 2 - -Total # of neighbors = 8769 -Ave neighs/atom = 7.3075 -Ave special neighs/atom = 0.5 -Neighbor list builds = 47 -Dangerous builds = 2 -unfix 2 -unfix 4 -unfix 5 -fix 5 solute rigid/small molecule - create bodies CPU = 7.70092e-05 secs -150 rigid bodies with 450 atoms - 1.30435 = max distance from body owner to body atom -fix 4 all enforce2d -run 500 -Per MPI rank memory allocation (min/avg/max) = 8.565 | 8.597 | 8.69 Mbytes -Step Temp E_pair E_mol TotEng Press - 500 0.45533691 -1.9072316 0.13695201 -1.4006978 2.4545793 - 550 0.45627282 -1.912409 0.13695201 -1.4051155 2.1845065 - 600 0.44734553 -1.8890695 0.13695201 -1.389022 2.3458965 - 650 0.46444648 -1.9042462 0.13695201 -1.3903185 2.1609319 - 700 0.47113236 -1.8977576 0.13695201 -1.3784032 2.2420351 - 750 0.48554548 -1.9253545 0.13695201 -1.3943015 2.143907 - 800 0.46350091 -1.8865749 0.13695201 -1.3734146 2.294431 - 850 0.4766104 -1.9094039 0.13695201 -1.3856031 2.2077157 - 900 0.48988467 -1.9051538 0.13695201 -1.3705787 2.0107056 - 950 0.48351942 -1.9162485 0.13695201 -1.3868399 2.1891332 - 1000 0.490337 -1.9115164 0.13695201 -1.3765742 2.1508141 -Loop time of 0.0588261 on 4 procs for 500 steps with 1200 atoms - -Performance: 3671840.233 tau/day, 8499.630 timesteps/s -98.3% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.022407 | 0.022631 | 0.0229 | 0.1 | 38.47 -Bond | 0.0010669 | 0.0011355 | 0.0012124 | 0.2 | 1.93 -Neigh | 0.0052333 | 0.00528 | 0.0053182 | 0.0 | 8.98 -Comm | 0.0063677 | 0.0066406 | 0.0068488 | 0.2 | 11.29 -Output | 0.00023055 | 0.00024778 | 0.00028086 | 0.0 | 0.42 -Modify | 0.020577 | 0.020651 | 0.020834 | 0.1 | 35.11 -Other | | 0.00224 | | | 3.81 - -Nlocal: 300 ave 303 max 295 min -Histogram: 1 0 0 0 0 0 1 0 1 1 -Nghost: 219 ave 224 max 215 min -Histogram: 1 0 0 1 1 0 0 0 0 1 -Neighs: 2185.75 ave 2244 max 2143 min -Histogram: 1 1 0 0 0 1 0 0 0 1 - -Total # of neighbors = 8743 -Ave neighs/atom = 7.28583 -Ave special neighs/atom = 0.5 -Neighbor list builds = 40 -Dangerous builds = 0 -Total wall time: 0:00:00 diff --git a/examples/micelle/log.4Apr2024.micelle-rigid.g++.1 b/examples/micelle/log.4Apr2024.micelle-rigid.g++.1 new file mode 100644 index 0000000000..d3cb98940d --- /dev/null +++ b/examples/micelle/log.4Apr2024.micelle-rigid.g++.1 @@ -0,0 +1,271 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-345-g506bf886ee-modified) +# 2d micelle simulation + +dimension 2 + +neighbor 0.3 bin +neigh_modify delay 5 + +atom_style bond + +# Soft potential push-off + +read_data data.micelle +Reading data file ... + orthogonal box = (0 0 -0.1) to (35.85686 35.85686 0.1) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 1200 atoms + scanning bonds ... + 1 = max bonds/atom + reading bonds ... + 300 bonds +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 2 = max # of 1-2 neighbors + 1 = max # of 1-3 neighbors + 1 = max # of 1-4 neighbors + 2 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.005 seconds +special_bonds fene +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 1 1 + special bond factors coul: 0 1 1 + 2 = max # of 1-2 neighbors + 2 = max # of special neighbors + special bonds CPU = 0.000 seconds + +pair_style soft 1.12246 +pair_coeff * * 0.0 1.12246 + +bond_style harmonic +bond_coeff 1 50.0 0.75 + +velocity all create 0.45 2349852 + +variable prefactor equal ramp(1.0,20.0) + +fix 1 all nve +fix 2 all temp/rescale 100 0.45 0.45 0.02 1.0 +fix 3 all adapt 1 pair soft a * * v_prefactor +fix 4 all enforce2d + +thermo 50 +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730) +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 1.42246 + ghost atom cutoff = 1.42246 + binsize = 0.71123, bins = 51 51 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair soft, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/2d + bin: standard +WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730) +Per MPI rank memory allocation (min/avg/max) = 4.148 | 4.148 | 4.148 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0.45 0.40003481 2.2200223e-06 0.84966203 0.78952518 + 50 0.54981866 0.93548899 0.068440043 1.5532895 1.9232786 + 100 0.45 0.99659327 0.079228519 1.5254468 3.2135679 + 150 0.86965411 0.90456016 0.07493355 1.8484231 4.3821925 + 200 0.45 1.01454 0.10663502 1.5708 4.7598476 + 250 0.79636561 0.82567712 0.12105337 1.7424325 5.4983899 + 300 0.45 0.86475538 0.11819875 1.4325791 5.8554758 + 350 0.72135464 0.70693069 0.10912636 1.5368106 6.0388247 + 400 0.45 0.75067331 0.14165013 1.3419484 6.3840708 + 450 0.64839221 0.62402486 0.14173679 1.4136135 6.4791009 + 500 0.45 0.66669513 0.13695201 1.2532721 6.807146 +Loop time of 0.0365221 on 1 procs for 500 steps with 1200 atoms + +Performance: 5914221.123 tau/day, 13690.327 timesteps/s, 16.428 Matom-step/s +89.2% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.022939 | 0.022939 | 0.022939 | 0.0 | 62.81 +Bond | 0.00073851 | 0.00073851 | 0.00073851 | 0.0 | 2.02 +Neigh | 0.0078339 | 0.0078339 | 0.0078339 | 0.0 | 21.45 +Comm | 0.00072134 | 0.00072134 | 0.00072134 | 0.0 | 1.98 +Output | 7.1419e-05 | 7.1419e-05 | 7.1419e-05 | 0.0 | 0.20 +Modify | 0.0034868 | 0.0034868 | 0.0034868 | 0.0 | 9.55 +Other | | 0.0007314 | | | 2.00 + +Nlocal: 1200 ave 1200 max 1200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 197 ave 197 max 197 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 3094 ave 3094 max 3094 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 3094 +Ave neighs/atom = 2.5783333 +Ave special neighs/atom = 0.5 +Neighbor list builds = 52 +Dangerous builds = 0 + +unfix 3 + +# Main run + +pair_style lj/cut 2.5 + +# solvent/head - full-size and long-range + +pair_coeff 1 1 1.0 1.0 2.5 +pair_coeff 2 2 1.0 1.0 2.5 +pair_coeff 1 2 1.0 1.0 2.5 + +# tail/tail - size-averaged and long-range + +pair_coeff 3 3 1.0 0.75 2.5 +pair_coeff 4 4 1.0 0.50 2.5 +pair_coeff 3 4 1.0 0.67 2.5 + +# solvent/tail - full-size and repulsive + +pair_coeff 1 3 1.0 1.0 1.12246 +pair_coeff 1 4 1.0 1.0 1.12246 + +# head/tail - size-averaged and repulsive + +pair_coeff 2 3 1.0 0.88 1.12246 +pair_coeff 2 4 1.0 0.75 1.12246 + +thermo 50 + +#dump 1 all atom 2000 dump.micelle + +#dump 2 all image 2000 image.*.jpg type type zoom 1.6 +#dump_modify 2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 + +#dump 3 all movie 2000 movie.mpg type type zoom 1.6 +#dump_modify 3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 + +reset_timestep 0 +group solvent molecule 0 +750 atoms in group solvent +group solute subtract all solvent +450 atoms in group solute +unfix 1 +unfix 2 +unfix 4 +fix 1 solvent nve +fix 2 solvent temp/rescale 100 0.45 0.45 0.02 1.0 +fix 5 solute rigid molecule langevin 0.45 0.45 0.5 112211 + 150 rigid bodies with 450 atoms +fix 4 all enforce2d +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 2.8 + ghost atom cutoff = 2.8 + binsize = 1.4, bins = 26 26 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/2d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.391 | 5.391 | 5.391 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0.45318168 -1.3753652 0.13695201 -0.8705807 1.975423 + 50 0.77344732 -1.6944083 0.13695201 -0.92967487 0.58657109 + 100 0.53530681 -1.7006195 0.13695201 -1.1291768 0.11219772 + 150 0.60820175 -1.8071581 0.13695201 -1.176549 1.5161796 + 200 0.49410558 -1.7945459 0.13695201 -1.2565449 4.0469262 + 250 0.52460847 -1.8528672 0.13695201 -1.290108 2.9929445 + 300 0.46596803 -1.8680499 0.13695201 -1.3528872 2.7958851 + 350 0.48831812 -1.8723486 0.13695201 -1.3390451 -4.5106818 + 400 0.46798432 -1.9008529 0.13695201 -1.3840536 -4.3096566 + 450 0.46000658 -1.9081144 0.13695201 -1.3977904 3.3360611 + 500 0.45822409 -1.9077531 0.13695201 -1.3988759 0.45428738 +Loop time of 0.0650638 on 1 procs for 500 steps with 1200 atoms + +Performance: 3319817.322 tau/day, 7684.762 timesteps/s, 9.222 Matom-step/s +100.0% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.027565 | 0.027565 | 0.027565 | 0.0 | 42.37 +Bond | 0.0007043 | 0.0007043 | 0.0007043 | 0.0 | 1.08 +Neigh | 0.012724 | 0.012724 | 0.012724 | 0.0 | 19.56 +Comm | 0.00091442 | 0.00091442 | 0.00091442 | 0.0 | 1.41 +Output | 6.004e-05 | 6.004e-05 | 6.004e-05 | 0.0 | 0.09 +Modify | 0.022329 | 0.022329 | 0.022329 | 0.0 | 34.32 +Other | | 0.0007666 | | | 1.18 + +Nlocal: 1200 ave 1200 max 1200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 411 ave 411 max 411 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 8759 ave 8759 max 8759 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 8759 +Ave neighs/atom = 7.2991667 +Ave special neighs/atom = 0.5 +Neighbor list builds = 46 +Dangerous builds = 2 +unfix 2 +unfix 4 +unfix 5 +fix 5 solute rigid/small molecule + create bodies CPU = 0.000 seconds + 150 rigid bodies with 450 atoms + 1.3043524 = max distance from body owner to body atom +fix 4 all enforce2d +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 9.306 | 9.306 | 9.306 Mbytes + Step Temp E_pair E_mol TotEng Press + 500 0.45822409 -1.9077531 0.13695201 -1.3988759 2.4509752 + 550 0.46736204 -1.9141964 0.13695201 -1.3979022 2.1695662 + 600 0.47872194 -1.9232781 0.13695201 -1.3977635 2.0058379 + 650 0.47491575 -1.9224109 0.13695201 -1.3999857 2.0637789 + 700 0.44714331 -1.8990682 0.13695201 -1.3991848 2.4863082 + 750 0.49089274 -1.9231004 0.13695201 -1.3877071 2.123147 + 800 0.4753839 -1.8959698 0.13695201 -1.3731645 2.3030481 + 850 0.46870816 -1.8972225 0.13695201 -1.3798357 2.2464703 + 900 0.49610454 -1.9070748 0.13695201 -1.3674513 2.2196388 + 950 0.4773035 -1.8925765 0.13695201 -1.3682132 2.3534786 + 1000 0.50413702 -1.9292393 0.13695201 -1.383096 2.1630988 +Loop time of 0.0592806 on 1 procs for 500 steps with 1200 atoms + +Performance: 3643690.276 tau/day, 8434.468 timesteps/s, 10.121 Matom-step/s +100.0% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.026866 | 0.026866 | 0.026866 | 0.0 | 45.32 +Bond | 0.00071863 | 0.00071863 | 0.00071863 | 0.0 | 1.21 +Neigh | 0.010927 | 0.010927 | 0.010927 | 0.0 | 18.43 +Comm | 0.00084187 | 0.00084187 | 0.00084187 | 0.0 | 1.42 +Output | 6.8106e-05 | 6.8106e-05 | 6.8106e-05 | 0.0 | 0.11 +Modify | 0.019075 | 0.019075 | 0.019075 | 0.0 | 32.18 +Other | | 0.000783 | | | 1.32 + +Nlocal: 1200 ave 1200 max 1200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 417 ave 417 max 417 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 8654 ave 8654 max 8654 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 8654 +Ave neighs/atom = 7.2116667 +Ave special neighs/atom = 0.5 +Neighbor list builds = 39 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/examples/micelle/log.4Apr2024.micelle-rigid.g++.4 b/examples/micelle/log.4Apr2024.micelle-rigid.g++.4 new file mode 100644 index 0000000000..ce15cfec21 --- /dev/null +++ b/examples/micelle/log.4Apr2024.micelle-rigid.g++.4 @@ -0,0 +1,272 @@ +LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-345-g506bf886ee-modified) +WARNING: Using I/O redirection is unreliable with parallel runs. Better to use the -in switch to read input files. (../lammps.cpp:551) +# 2d micelle simulation + +dimension 2 + +neighbor 0.3 bin +neigh_modify delay 5 + +atom_style bond + +# Soft potential push-off + +read_data data.micelle +Reading data file ... + orthogonal box = (0 0 -0.1) to (35.85686 35.85686 0.1) + 2 by 2 by 1 MPI processor grid + reading atoms ... + 1200 atoms + scanning bonds ... + 1 = max bonds/atom + reading bonds ... + 300 bonds +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 2 = max # of 1-2 neighbors + 1 = max # of 1-3 neighbors + 1 = max # of 1-4 neighbors + 2 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.004 seconds +special_bonds fene +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 1 1 + special bond factors coul: 0 1 1 + 2 = max # of 1-2 neighbors + 2 = max # of special neighbors + special bonds CPU = 0.000 seconds + +pair_style soft 1.12246 +pair_coeff * * 0.0 1.12246 + +bond_style harmonic +bond_coeff 1 50.0 0.75 + +velocity all create 0.45 2349852 + +variable prefactor equal ramp(1.0,20.0) + +fix 1 all nve +fix 2 all temp/rescale 100 0.45 0.45 0.02 1.0 +fix 3 all adapt 1 pair soft a * * v_prefactor +fix 4 all enforce2d + +thermo 50 +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730) +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 1.42246 + ghost atom cutoff = 1.42246 + binsize = 0.71123, bins = 51 51 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair soft, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/2d + bin: standard +WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730) +Per MPI rank memory allocation (min/avg/max) = 4.126 | 4.126 | 4.127 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0.45 0.40003481 2.2200223e-06 0.84966203 0.78952518 + 50 0.54981866 0.93548899 0.068440043 1.5532895 1.9232786 + 100 0.45 0.99659327 0.079228519 1.5254468 3.2135679 + 150 0.86965411 0.90456016 0.07493355 1.8484231 4.3821925 + 200 0.45 1.01454 0.10663502 1.5708 4.7598476 + 250 0.79636561 0.82567712 0.12105337 1.7424325 5.4983899 + 300 0.45 0.86475538 0.11819875 1.4325791 5.8554758 + 350 0.72135464 0.70693069 0.10912636 1.5368106 6.0388247 + 400 0.45 0.75067331 0.14165013 1.3419484 6.3840708 + 450 0.64839221 0.62402486 0.14173679 1.4136135 6.4791009 + 500 0.45 0.66669513 0.13695201 1.2532721 6.807146 +Loop time of 0.0138659 on 4 procs for 500 steps with 1200 atoms + +Performance: 15577811.312 tau/day, 36059.748 timesteps/s, 43.272 Matom-step/s +99.9% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.0053896 | 0.0057144 | 0.0060899 | 0.4 | 41.21 +Bond | 0.00020074 | 0.00021422 | 0.00022291 | 0.0 | 1.54 +Neigh | 0.0025301 | 0.0025401 | 0.0025501 | 0.0 | 18.32 +Comm | 0.0031194 | 0.0035074 | 0.0038196 | 0.4 | 25.30 +Output | 6.4137e-05 | 6.7743e-05 | 7.7909e-05 | 0.0 | 0.49 +Modify | 0.0013391 | 0.0013582 | 0.0013972 | 0.1 | 9.80 +Other | | 0.0004638 | | | 3.34 + +Nlocal: 300 ave 304 max 292 min +Histogram: 1 0 0 0 0 0 0 0 2 1 +Nghost: 103.5 ave 108 max 98 min +Histogram: 1 0 0 1 0 0 0 0 0 2 +Neighs: 773.5 ave 792 max 735 min +Histogram: 1 0 0 0 0 0 0 0 2 1 + +Total # of neighbors = 3094 +Ave neighs/atom = 2.5783333 +Ave special neighs/atom = 0.5 +Neighbor list builds = 52 +Dangerous builds = 0 + +unfix 3 + +# Main run + +pair_style lj/cut 2.5 + +# solvent/head - full-size and long-range + +pair_coeff 1 1 1.0 1.0 2.5 +pair_coeff 2 2 1.0 1.0 2.5 +pair_coeff 1 2 1.0 1.0 2.5 + +# tail/tail - size-averaged and long-range + +pair_coeff 3 3 1.0 0.75 2.5 +pair_coeff 4 4 1.0 0.50 2.5 +pair_coeff 3 4 1.0 0.67 2.5 + +# solvent/tail - full-size and repulsive + +pair_coeff 1 3 1.0 1.0 1.12246 +pair_coeff 1 4 1.0 1.0 1.12246 + +# head/tail - size-averaged and repulsive + +pair_coeff 2 3 1.0 0.88 1.12246 +pair_coeff 2 4 1.0 0.75 1.12246 + +thermo 50 + +#dump 1 all atom 2000 dump.micelle + +#dump 2 all image 2000 image.*.jpg type type zoom 1.6 +#dump_modify 2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 + +#dump 3 all movie 2000 movie.mpg type type zoom 1.6 +#dump_modify 3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75 + +reset_timestep 0 +group solvent molecule 0 +750 atoms in group solvent +group solute subtract all solvent +450 atoms in group solute +unfix 1 +unfix 2 +unfix 4 +fix 1 solvent nve +fix 2 solvent temp/rescale 100 0.45 0.45 0.02 1.0 +fix 5 solute rigid molecule langevin 0.45 0.45 0.5 112211 + 150 rigid bodies with 450 atoms +fix 4 all enforce2d +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 5 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 2.8 + ghost atom cutoff = 2.8 + binsize = 1.4, bins = 26 26 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/2d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.375 | 5.375 | 5.375 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0.45318168 -1.3753652 0.13695201 -0.8705807 1.975423 + 50 0.77344732 -1.6944083 0.13695201 -0.92967487 0.58657109 + 100 0.53530681 -1.7006195 0.13695201 -1.1291768 0.11219772 + 150 0.60820175 -1.8071581 0.13695201 -1.176549 1.5161796 + 200 0.49410558 -1.7945459 0.13695201 -1.2565449 4.0469262 + 250 0.52460847 -1.8528672 0.13695201 -1.290108 2.9929445 + 300 0.46596803 -1.8680499 0.13695201 -1.3528872 2.7958851 + 350 0.48831812 -1.8723486 0.13695201 -1.3390451 -4.5106818 + 400 0.46798432 -1.9008529 0.13695201 -1.3840536 -4.3096566 + 450 0.46000658 -1.9081144 0.13695201 -1.3977904 3.3360611 + 500 0.45822409 -1.9077531 0.13695201 -1.3988759 0.45428738 +Loop time of 0.0381773 on 4 procs for 500 steps with 1200 atoms + +Performance: 5657810.772 tau/day, 13096.784 timesteps/s, 15.716 Matom-step/s +99.6% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.0059651 | 0.0062314 | 0.0066404 | 0.3 | 16.32 +Bond | 0.00021057 | 0.00022477 | 0.0002333 | 0.0 | 0.59 +Neigh | 0.0041424 | 0.0041487 | 0.0041512 | 0.0 | 10.87 +Comm | 0.004264 | 0.0047244 | 0.0050297 | 0.4 | 12.37 +Output | 8.2396e-05 | 8.6559e-05 | 9.6749e-05 | 0.0 | 0.23 +Modify | 0.021833 | 0.021946 | 0.022094 | 0.1 | 57.48 +Other | | 0.0008157 | | | 2.14 + +Nlocal: 300 ave 303 max 296 min +Histogram: 1 0 0 0 1 0 0 0 1 1 +Nghost: 216.25 ave 219 max 214 min +Histogram: 1 0 1 0 0 0 1 0 0 1 +Neighs: 2189.75 ave 2205 max 2173 min +Histogram: 1 0 0 0 1 0 1 0 0 1 + +Total # of neighbors = 8759 +Ave neighs/atom = 7.2991667 +Ave special neighs/atom = 0.5 +Neighbor list builds = 46 +Dangerous builds = 2 +unfix 2 +unfix 4 +unfix 5 +fix 5 solute rigid/small molecule + create bodies CPU = 0.000 seconds + 150 rigid bodies with 450 atoms + 1.3043524 = max distance from body owner to body atom +fix 4 all enforce2d +run 500 +Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 9.233 | 9.233 | 9.234 Mbytes + Step Temp E_pair E_mol TotEng Press + 500 0.45822409 -1.9077531 0.13695201 -1.3988759 2.4509752 + 550 0.46736204 -1.9141964 0.13695201 -1.3979022 2.1695662 + 600 0.47872194 -1.9232781 0.13695201 -1.3977635 2.0058379 + 650 0.47491575 -1.9224109 0.13695201 -1.3999857 2.0637789 + 700 0.44714331 -1.8990682 0.13695201 -1.3991848 2.4863082 + 750 0.49089274 -1.9231004 0.13695201 -1.3877071 2.123147 + 800 0.4753839 -1.8959698 0.13695201 -1.3731645 2.3030481 + 850 0.46870816 -1.8972225 0.13695201 -1.3798357 2.2464703 + 900 0.49610454 -1.9070748 0.13695201 -1.3674513 2.2196388 + 950 0.4773035 -1.8925765 0.13695201 -1.3682132 2.3534786 + 1000 0.50413702 -1.9292393 0.13695201 -1.383096 2.1630987 +Loop time of 0.0236819 on 4 procs for 500 steps with 1200 atoms + +Performance: 9120883.727 tau/day, 21113.157 timesteps/s, 25.336 Matom-step/s +99.9% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.0058656 | 0.0059718 | 0.0061355 | 0.1 | 25.22 +Bond | 0.0002083 | 0.00022447 | 0.00023485 | 0.0 | 0.95 +Neigh | 0.0035477 | 0.0035644 | 0.0035824 | 0.0 | 15.05 +Comm | 0.0041037 | 0.0042227 | 0.0043024 | 0.1 | 17.83 +Output | 7.4355e-05 | 7.8273e-05 | 8.7777e-05 | 0.0 | 0.33 +Modify | 0.008976 | 0.0090549 | 0.0091663 | 0.1 | 38.24 +Other | | 0.0005654 | | | 2.39 + +Nlocal: 300 ave 306 max 295 min +Histogram: 1 0 1 0 0 1 0 0 0 1 +Nghost: 221 ave 226 max 217 min +Histogram: 1 0 0 1 1 0 0 0 0 1 +Neighs: 2163.5 ave 2271 max 2100 min +Histogram: 1 1 0 1 0 0 0 0 0 1 + +Total # of neighbors = 8654 +Ave neighs/atom = 7.2116667 +Ave special neighs/atom = 0.5 +Neighbor list builds = 39 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f415b30334..7255b8770f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -33,6 +33,7 @@ //#define ASYNC_DEVICE_COPY +#if 0 #if !defined(USE_OPENCL) && !defined(USE_HIP) // temporary workaround for int2 also defined in cufft #ifdef int2 @@ -40,6 +41,7 @@ #endif #include "cufft.h" #endif +#endif namespace LAMMPS_AL { @@ -313,10 +315,11 @@ class BaseAmoeba { virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; - +#if 0 #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; #endif +#endif bool fft_plan_created; }; diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 40e3c95f24..f7b8af7695 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. [\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_onapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6784) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6739) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585 ](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs update [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) + +#### OpenMP: +* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `` and `` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessaryinitialization + avoid evaluating twice the predicate during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove all `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190 ](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) @@ -999,95 +1099,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. -- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 4a4e7a5501..93a796f200 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 3) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 393422d73c..a167ce2070 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 3 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -22,14 +22,14 @@ KOKKOS_DEVICES ?= "OpenMP" # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -56,7 +56,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -73,11 +73,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -318,7 +318,6 @@ endif # Intel based. KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -398,11 +397,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. -KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -413,22 +410,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? -KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -573,6 +585,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -612,27 +634,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -699,10 +700,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else @@ -827,20 +824,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -1249,7 +1232,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1289,10 +1271,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1403,11 +1381,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects @@ -1461,6 +1434,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1493,7 +1472,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index ec8770dd7d..e6900a822a 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp @@ -82,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -123,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index 033346e956..19793bb82d 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -28,7 +28,7 @@ To start learning about Kokkos: - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. For non-public questions send an email to: *crtrott(at)sandia.gov* @@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati # License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/lib/kokkos/SECURITY.md b/lib/kokkos/SECURITY.md new file mode 100644 index 0000000000..93cf6e3663 --- /dev/null +++ b/lib/kokkos/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90 day disclosure timeline. diff --git a/lib/kokkos/Spack.md b/lib/kokkos/Spack.md index 79606c259d..06c763a64e 100644 --- a/lib/kokkos/Spack.md +++ b/lib/kokkos/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). -Some examples are included in the `config/yaml` folder for common platforms. Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. For example, with Kokkos Kernels: ````bash diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt index 1695778947..b490caca62 100644 --- a/lib/kokkos/algorithms/src/CMakeLists.txt +++ b/lib/kokkos/algorithms/src/CMakeLists.txt @@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms ${CMAKE_CURRENT_SOURCE_DIR} ) - - +KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 2d7d236d2f..7df12b8518 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -849,18 +849,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1094,18 +1093,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1545,13 +1543,23 @@ template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin, typename ViewType::const_value_type end) { - fill_random(typename ViewType::execution_space{}, a, g, begin, end); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, begin, end); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { - fill_random(typename ViewType::execution_space{}, a, g, 0, range); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, 0, range); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } } // namespace Kokkos diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index f77484cc55..136b4ec82d 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -23,6 +23,7 @@ #include "sorting/Kokkos_BinSortPublicAPI.hpp" #include "sorting/Kokkos_SortPublicAPI.hpp" +#include "sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp b/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10b..b532a774e1 100644 --- a/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 0000000000..fc73eccad6 --- /dev/null +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e58..308e9e3a00 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view) { // constraints using ViewType = Kokkos::View; @@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last); @@ -82,7 +83,7 @@ void sort(const Kokkos::View& view) { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view, const ComparatorType& comparator) { // constraints @@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last, comparator); diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac823319..2fe58272d9 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ -18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include -#include namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 0000000000..36deccdfb1 --- /dev/null +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. +template +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + KOKKOS_LAMBDA(int i) { permute(i) = i; }); + + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + host_exec, host_permute, + KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(host_keys(i), host_keys(j)); + }); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, + KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys(i), keys(j)); + }); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e77..4c174b5fda 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -63,6 +63,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include #include @@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_rocthrust(const HIP& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_onedpl(const Kokkos::Experimental::SYCL& space, @@ -274,6 +299,14 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( @@ -320,6 +353,15 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5ed..c5406c72b0 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b8..82071a9362 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b9..599fde5737 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template & source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c9120483..637d8d4cbc 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22..593c42f87e 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template & view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dd..5bb2d1039d 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df45..e13479c370 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template & view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba8..ac308ea184 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57..2789ab2179 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983a..66f39c4eaa 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b6487..d66763d304 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e2..84cbed524d 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template & source, - ::Kokkos::View& dest, UnaryOperation unary_op) { + const ::Kokkos::View& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee3495..5a7fe16984 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874..456df43aed 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep from original range to tmp diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744..b4046c7645 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d61..9414748507 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba..0414e6f1c2 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 @@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7dd..930a14ac48 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e..2863582458 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d1..db184bc8a9 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) set(ALGO_SORT_SOURCES) foreach(SOURCE_Input TestSort + TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB @@ -57,35 +58,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps - ) - list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsLexicographicalCompare StdAlgorithmsForEach @@ -100,15 +103,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsSearch_n StdAlgorithmsMismatch StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsModOps StdAlgorithmsModSeqOps @@ -128,15 +131,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsReverse StdAlgorithmsShiftLeft StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsIsSorted StdAlgorithmsIsSortedUntil @@ -149,83 +152,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTransformUnaryOp StdAlgorithmsTransformExclusiveScan StdAlgorithmsTransformInclusiveScan - ) - list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name +# ------------------------------------------ +# std team P +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name +# ------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamCopy StdAlgorithmsTeamCopy_n @@ -236,43 +239,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamRemoveIf StdAlgorithmsTeamRemoveCopy StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name +# ------------------------------------------ +# std team G +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std team E +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFill StdAlgorithmsTeamFill_n @@ -280,28 +283,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamReplaceIf StdAlgorithmsTeamReplaceCopy StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement - ) - list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFind StdAlgorithmsTeamFindIf @@ -310,29 +313,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamAnyOf StdAlgorithmsTeamNoneOf StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamAdjacentFind StdAlgorithmsTeamCount @@ -341,11 +344,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamForEachN StdAlgorithmsTeamLexicographicalCompare StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - endforeach() - - endif() + ) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index 601217799a..d3946c149b 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 0000000000..16f68eaaf2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,241 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include +#include +#include +#include + +#include // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "ArborX::Algorithms::iota", + Kokkos::RangePolicy(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View keys("keys"); + + Kokkos::View values_static("values_static"); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_static)); + + Kokkos::View values_dynamic("values_dynamic", 10); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); +} + +template +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View keys("keys", n, n, n); + Kokkos::View values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 3); + Kokkos::View values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys extents must be the same"); +} + +} // namespace Test +#endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 3eb963faf2..67052e2f9d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != target; int accum = static_cast(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum(accum)); return (accum == 0); -#endif } template diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987..b364c53a88 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -189,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + auto view1 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it, ValueType{0}); - run_single_scenario_default_op(it, ValueType{1}); - run_single_scenario_default_op(it, ValueType{-2}); - run_single_scenario_default_op(it, ValueType{3}); + run_single_scenario(it, ValueType{0}); + run_single_scenario(it, ValueType{1}); + run_single_scenario(it, ValueType{-2}); + run_single_scenario(it, ValueType{3}); + + run_single_scenario_inplace(it, ValueType{0}); + run_single_scenario_inplace(it, ValueType{-2}); #if !defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); } using custom_bop_t = SumFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); #endif } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5f..a08a737210 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, Args... args /* copy on purpose */) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -204,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place - auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -318,27 +318,35 @@ void run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it); + run_single_scenario(it); + run_single_scenario_inplace(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; - run_single_scenario_custom_op(it, sbop); - run_single_scenario_custom_op(it, sbop, ValueType{0}); - run_single_scenario_custom_op(it, sbop, ValueType{1}); - run_single_scenario_custom_op(it, sbop, ValueType{-2}); - run_single_scenario_custom_op(it, sbop, ValueType{3}); + run_single_scenario(it, sbop); + run_single_scenario(it, sbop, ValueType{0}); + run_single_scenario(it, sbop, ValueType{1}); + run_single_scenario(it, sbop, ValueType{-2}); + run_single_scenario(it, sbop, ValueType{3}); + + run_single_scenario_inplace(it, sbop, ValueType{0}); + run_single_scenario_inplace(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor; mult_binary_op mbop; - run_single_scenario_custom_op(it, mbop); - run_single_scenario_custom_op(it, mbop, ValueType{0}); - run_single_scenario_custom_op(it, mbop, ValueType{1}); - run_single_scenario_custom_op(it, mbop, ValueType{-2}); - run_single_scenario_custom_op(it, mbop, ValueType{3}); + run_single_scenario(it, mbop); + run_single_scenario(it, mbop, ValueType{0}); + run_single_scenario(it, mbop, ValueType{1}); + run_single_scenario(it, mbop, ValueType{-2}); + run_single_scenario(it, mbop, ValueType{3}); + + run_single_scenario_inplace(it, mbop); + run_single_scenario_inplace(it, mbop, ValueType{0}); + run_single_scenario_inplace(it, mbop, ValueType{-2}); } #endif } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b..75d4f0afeb 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e..29ac7cc9bc 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); Kokkos::fence(); } @@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097..1b1a02f39c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = create_view(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce8..a36c9db2b9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b5aa27c7c3..7c3c465dc8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6c..2c8fee02f4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { @@ -236,16 +247,24 @@ void run_all_scenarios() { #else for (int apiId : {0, 1}) { #endif - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamExclusiveScan diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe8..b5f4cdd612 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamInclusiveScan diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 24b840154b..6bb0d24998 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index ce18eb4d31..cff9aa178a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -168,6 +168,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8e..60fa369af1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformExclusiveScan diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b31660232..10454d6551 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformInclusiveScan diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a1..0d3289e196 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75f..fa2804256a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -205,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); + auto view_from = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + UnaryOp uop) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view2"); + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan( + "label", exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value, + bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2, + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -267,6 +314,11 @@ void run_all_scenarios() { run_single_scenario(it, ValueType{1}, bop_t(), uop_t()); run_single_scenario(it, ValueType{-2}, bop_t(), uop_t()); run_single_scenario(it, ValueType{3}, bop_t(), uop_t()); + + run_single_scenario_inplace(it, ValueType{0}, bop_t(), + uop_t()); + run_single_scenario_inplace(it, ValueType{-2}, bop_t(), + uop_t()); } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index a90a68ca1d..fb81ae91b0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -210,30 +202,11 @@ struct SumBinaryFunctor { std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void print_scenario_details(const std::string& name, BopT bop, UopT uop) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << std::endl; -} - -template -void print_scenario_details(const std::string& name, BopT bop, UopT uop, - ValueType init_value) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << ", " - << "init = " << init_value << std::endl; -} - template void run_single_scenario(const InfoType& scenario_info, Args... args /* by value on purpose*/) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // print_scenario_details(name, args...); auto view_dest = create_view(Tag{}, view_ext, "transform_inclusive_scan"); @@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* by value on purpose*/) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to scenario and is not modified + // view2: filled according scenario and used for the in-place op + // Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan. + // NOTE: view2 must be filled before every call to the algorithm + // because the algorithm acts in place + + auto view_1 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_1"); + fill_view(view_1, name); + + auto view_2 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_2"); + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2), + KE::cend(view_2), KE::begin(view_2), + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), + KE::cbegin(view_2), KE::cend(view_2), + KE::begin(view_2), args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2, + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -294,15 +324,23 @@ void run_all_scenarios() { run_single_scenario(it, bop_t(), uop_t(), ValueType{2}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-1}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-2}); + + run_single_scenario_inplace(it, bop_t(), uop_t()); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{0}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{2}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{-2}); } } #if !defined KOKKOS_ENABLE_OPENMPTARGET TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } #endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a3..c05006a161 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - ASSERT_EQ(pair3.second, gold_location); + ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index 42279bf55d..abf5028359 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -1 +1,12 @@ +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. +KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) + +#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. +IF(NOT Kokkos_ENABLE_OPENMPTARGET) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) +ENDIF() diff --git a/lib/kokkos/benchmarks/atomic/CMakeLists.txt b/lib/kokkos/benchmarks/atomic/CMakeLists.txt new file mode 100644 index 0000000000..85f7412f49 --- /dev/null +++ b/lib/kokkos/benchmarks/atomic/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + atomic + SOURCES main.cpp +) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt new file mode 100644 index 0000000000..0ce44a6f1a --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + bytes_and_flops + SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp +) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp index 2589fd7309..88830af624 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -37,22 +37,22 @@ struct RunStride { }; #define STRIDE 1 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 2 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 4 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 8 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 16 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 32 -#include +#include "bench_stride.hpp" #undef STRIDE template diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp index f955c99666..2fda1ae3d4 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp index 137ff67d40..3210116a9e 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp index 29ccec0141..24a5dcd389 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp index c153d5eff3..0634700c31 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp index b63d486fc9..80f017fbe8 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp @@ -15,28 +15,28 @@ //@HEADER #define UNROLL 1 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 3 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL template diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 0f7a298c1b..78cfd48eff 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -26,7 +26,7 @@ struct Run { Kokkos::deep_copy(C, Scalar(3.5)); Kokkos::Timer timer; - for (int i = 0; i < I; ++i) { + for (int iter = 0; iter < I; ++iter) { Kokkos::parallel_for( "BenchmarkKernel", Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)), diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index 20077757d1..fdfcc4ea64 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "bench.hpp" #include extern template void run_stride_unroll(int, int, int, int, int, int, int, @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) { printf("D must be one of 1,2,4,8,16,32\n"); return 0; } - if ((P < 1) && (P > 2)) { + if ((P < 1) || (P > 4)) { printf("P must be one of 1,2,3,4\n"); return 0; } diff --git a/lib/kokkos/benchmarks/gather/CMakeLists.txt b/lib/kokkos/benchmarks/gather/CMakeLists.txt new file mode 100644 index 0000000000..24c7062772 --- /dev/null +++ b/lib/kokkos/benchmarks/gather/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + gather + SOURCES main.cpp +) diff --git a/lib/kokkos/benchmarks/gather/gather.hpp b/lib/kokkos/benchmarks/gather/gather.hpp index d83461702c..90b1101c1d 100644 --- a/lib/kokkos/benchmarks/gather/gather.hpp +++ b/lib/kokkos/benchmarks/gather/gather.hpp @@ -20,28 +20,28 @@ struct RunGather { }; #define UNROLL 1 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 3 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "gather_unroll.hpp" #undef UNROLL template diff --git a/lib/kokkos/benchmarks/gather/gather_unroll.hpp b/lib/kokkos/benchmarks/gather/gather_unroll.hpp index 5ee5742a3f..1aa73091bc 100644 --- a/lib/kokkos/benchmarks/gather/gather_unroll.hpp +++ b/lib/kokkos/benchmarks/gather/gather_unroll.hpp @@ -138,7 +138,7 @@ struct RunGather { printf( "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " "%lf GGather/s: %lf\n", - sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds, + static_cast(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds, 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds, 1.e-9 * gather_ops / seconds); } diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 7f4fc9ede6..07fca9fdc6 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "gather.hpp" #include int main(int argc, char* argv[]) { diff --git a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt new file mode 100644 index 0000000000..bb14da749d --- /dev/null +++ b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + launch_latency + SOURCES launch_latency.cpp +) diff --git a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp new file mode 100644 index 0000000000..73b176ab8d --- /dev/null +++ b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -0,0 +1,283 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file launch_latency.cpp + + Tests of parallel_for and parallel_reduce latency for different + circumstances. + + Three launch kinds are tested: parallel_for, parallel_reduce into scalar, + and parallel_reduce into view + + N controls how large the parallel loops is + V controls how large the functor is + M controls across how many launches the latency is averaged + K controls how larege the nested loop is (no larger than V) + + For each launch kind, + 1. Avg functor dispatch latency: (time to do M launches) / M + 2. Avg functor completion throughput: (M launches + sync) / M + 3. Avg functor completion latency: (M (launch + sync)) / M +*/ + +#include + +template +struct TestFunctor { + double values[V]; + Kokkos::View a; + int K; + TestFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + } +}; + +template +struct TestRFunctor { + double values[V]; + Kokkos::View a; + int K; + TestRFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double& lsum) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + lsum += a(i); + } +}; + +struct Opts { + bool par_for = true; + bool par_reduce = true; + bool par_reduce_view = true; +}; + +template +void run(int N, int M, int K, const Opts& opts) { + std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence, + l_red_view_no_fence, l_red_view_fence; + { + std::ostringstream ostream; + ostream << "RunNoFence_" << N << "_" << K << std::endl; + l_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunFence_" << N << "_" << K << std::endl; + l_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceNoFence_" << N << "_" << K << std::endl; + l_red_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceFence_" << N << "_" << K << std::endl; + l_red_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewNoFence_" << N << "_" << K << std::endl; + l_red_view_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewFence_" << N << "_" << K << std::endl; + l_red_view_fence = ostream.str(); + } + + double result; + Kokkos::View a("A", N); + Kokkos::View v_result("result"); + TestFunctor f(a, K); + TestRFunctor rf(a, K); + Kokkos::Timer timer; + + // initialize to an obviously wrong value + double time_no_fence = -1; // launch loop + double time_no_fence_fenced = -1; // launch loop then fence + double time_fence = -1; // launch&fence loop + + double time_red_no_fence = -1; + double time_red_no_fence_fenced = -1; + double time_red_fence = -1; + + double time_red_view_no_fence = -1; + double time_red_view_no_fence_fenced = -1; + double time_red_view_fence = -1; + + if (opts.par_for) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_for(l_no_fence, N, f); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_no_fence, N, f); + } + time_no_fence = timer.seconds(); + Kokkos::fence(); + time_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_fence, N, f); + Kokkos::fence(); + } + time_fence = timer.seconds(); + } + + if (opts.par_reduce) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + time_red_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_fence, N, rf, result); + Kokkos::fence(); + } + time_red_fence = timer.seconds(); + Kokkos::fence(); + } + + if (opts.par_reduce_view) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + time_red_view_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_view_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result); + Kokkos::fence(); + } + time_red_view_fence = timer.seconds(); + Kokkos::fence(); + timer.reset(); + } + + const double x = 1.e6 / M; + printf("%i %i %i %i", N, V, K, M); + if (opts.par_for) { + printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence, + x * time_no_fence_fenced); + } + if (opts.par_reduce) { + printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence, + x * time_red_fence, x * time_red_no_fence_fenced); + } + if (opts.par_reduce_view) { + printf(" parallel_reduce(view): %lf %lf ( %lf )", + x * time_red_view_no_fence, x * time_red_view_fence, + x * time_red_view_no_fence_fenced); + } + printf("\n"); +} +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int N = 10000; + int M = 20; + int K = 1; + + Opts opts; + + printf("==========================\n"); + printf("Kokkos Launch Latency Test\n"); + printf("==========================\n"); + printf("\n"); + printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]); + printf("Arguments: N M K\n"); + printf(" N: loop length\n"); + printf(" M: how many kernels to dispatch\n"); + printf( + " K: nested loop length (capped by size of functor member array\n\n"); + printf("Options:\n"); + printf(" --no-parallel-for: skip parallel_for benchmark\n"); + printf(" --no-parallel-reduce: skip parallel_reduce benchmark\n"); + printf( + " --no-parallel-reduce-view: skip parallel_reduce into view " + "benchmark\n"); + printf("\n\n"); + printf(" Output V is the size of the functor member array\n"); + printf("\n\n"); + + for (int i = 1; i < argc; ++i) { + const std::string_view arg(argv[i]); + + // anything that doesn't start with -- + if (arg.size() < 2 || + (arg.size() >= 2 && arg[0] != '-' && arg[1] != '-')) { + if (i == 1) + N = atoi(arg.data()); + else if (i == 2) + M = atoi(arg.data()); + else if (i == 3) + K = atoi(arg.data()); + else { + throw std::runtime_error("unexpected argument!"); + } + } else if (arg == "--no-parallel-for") { + opts.par_for = false; + } else if (arg == "--no-parallel-reduce") { + opts.par_reduce = false; + } else if (arg == "--no-parallel-reduce-view") { + opts.par_reduce_view = false; + } else { + std::stringstream ss; + ss << "unexpected argument \"" << arg << "\" at position " << i; + throw std::runtime_error(ss.str()); + } + } + + printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n"); + + /* A backend may have different launch strategies for functors of different + * sizes: test a variety of functor sizes.*/ + run<1>(N, M, K <= 1 ? K : 1, opts); + run<16>(N, M, K <= 16 ? K : 16, opts); + run<200>(N, M, K <= 200 ? K : 200, opts); + run<3000>(N, M, K <= 3000 ? K : 3000, opts); + run<30000>(N, M, K <= 30000 ? K : 30000, opts); + } + Kokkos::finalize(); +} diff --git a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 0000000000..929b9c9702 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + policy_performance + SOURCES main.cpp +) diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp index 28cfde552a..0983a3d535 100644 --- a/lib/kokkos/benchmarks/policy_performance/main.cpp +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View; diff --git a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp index cc2cc40257..0e23d221f6 100644 --- a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp +++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } diff --git a/lib/kokkos/benchmarks/stream/CMakeLists.txt b/lib/kokkos/benchmarks/stream/CMakeLists.txt new file mode 100644 index 0000000000..0dded6e3a5 --- /dev/null +++ b/lib/kokkos/benchmarks/stream/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + stream + SOURCES stream-kokkos.cpp +) diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index c140087240..9b935835d5 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args diff --git a/lib/kokkos/cmake/KokkosConfig.cmake.in b/lib/kokkos/cmake/KokkosConfig.cmake.in index e26c75b312..1b6d1b66ff 100644 --- a/lib/kokkos/cmake/KokkosConfig.cmake.in +++ b/lib/kokkos/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 9930d2abf0..2df0f6c520 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -23,8 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED @@ -32,6 +30,7 @@ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM @@ -45,7 +44,6 @@ #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS @@ -53,17 +51,15 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 @@ -78,6 +74,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_RISCV_SG2042 #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 792c92c07e..5a62c530fc 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT) ENDIF() ENDIF() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +# FIXME CMake 3.28.4 creates more targets than we export +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4") find_package(CUDAToolkit) ELSE() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBRT.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBRT.cmake deleted file mode 100644 index e75da56b5b..0000000000 --- a/lib/kokkos/cmake/Modules/FindTPLLIBRT.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt) diff --git a/lib/kokkos/cmake/Modules/FindTPLMEMKIND.cmake b/lib/kokkos/cmake/Modules/FindTPLMEMKIND.cmake deleted file mode 100644 index 20aaff2295..0000000000 --- a/lib/kokkos/cmake/Modules/FindTPLMEMKIND.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind) diff --git a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 01791cff44..603510c315 100644 --- a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -43,4 +43,7 @@ ELSE() COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) ENDIF() + + # Export oneDPL as a Kokkos dependency + KOKKOS_EXPORT_CMAKE_TPL(oneDPL) ENDIF() diff --git a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake new file mode 100644 index 0000000000..dae7dc3c95 --- /dev/null +++ b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -0,0 +1,15 @@ +# ROCm 5.6 and earlier set AMDGPU_TARGETS and GPU_TARGETS to all the supported +# architectures. Therefore, we end up compiling Kokkos for all the supported +# architecture. Starting with ROCm 5.7 AMDGPU_TARGETS and GPU_TARGETS are empty. +# It is the user's job to set the variables. Since we are injecting the +# architecture flag ourselves, we can let the variables empty. To replicate the +# behavior of ROCm 5.7 and later for earlier version of ROCm we set +# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If +# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. +SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +FIND_PACKAGE(rocthrust REQUIRED) +KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +KOKKOS_EXPORT_CMAKE_TPL(rocthrust) diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 30764bde86..34e9f05986 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -49,7 +49,6 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -60,13 +59,12 @@ DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (A DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") +DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) SET(KOKKOS_SHOW_CUDA_ARCHS ON) @@ -191,9 +189,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) @@ -342,18 +337,6 @@ IF (KOKKOS_ARCH_ZEN3) SET(KOKKOS_ARCH_AVX2 ON) ENDIF() -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 - ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( @@ -378,6 +361,23 @@ IF (KOKKOS_ARCH_HSW) ) ENDIF() +IF (KOKKOS_ARCH_RISCV_SG2042) + IF(NOT + (KOKKOS_CXX_COMPILER_ID STREQUAL GNU + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR + (KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) + ) + MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + ENDIF() + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -march=rv64imafdcv + ) +ENDIF() + + IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( @@ -571,6 +571,11 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT --hip-link + ) + ENDIF() ELSE() COMPILER_SPECIFIC_FLAGS( DEFAULT -fno-gpu-rdc @@ -588,32 +593,44 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. +IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; + INCLUDE(CheckCXXSymbolExists) + CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. + COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + ELSEIF(NOT BUILD_SHARED_LIBS) + INCLUDE(CheckCXXSourceCompiles) + CHECK_CXX_SOURCE_COMPILES(" + #include + using namespace sycl::ext::oneapi::experimental; + using namespace sycl; - SYCL_EXTERNAL device_global Foo; + SYCL_EXTERNAL device_global Foo; - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } + void bar(queue q) { + q.single_task([=] { + Foo = 42; + }); + } - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + int main(){ return 0; } + " + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED - ) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + # Only the separable compilation implementation is supported. + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) + ENDIF() ENDIF() ENDIF() @@ -767,30 +784,35 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) COMPILER_SPECIFIC_FLAGS( IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__ + ELSE() + COMPILER_SPECIFIC_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__ + IF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" + ) + ELSEIF(KOKKOS_ARCH_INTEL_PVC) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" ) + ENDIF() ENDIF() ENDIF() @@ -1130,3 +1152,14 @@ MESSAGE(STATUS "Architectures:") FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) MESSAGE(STATUS " ${Arch}") ENDFOREACH() + + +IF(KOKKOS_ENABLE_ATOMICS_BYPASS) + IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") + ENDIF() + IF(NOT KOKKOS_ENABLE_SERIAL) + MESSAGE(FATAL_ERROR "Implementation bug") # safeguard + ENDIF() + MESSAGE(STATUS "Atomics: **DISABLED**") +ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 04589befc3..9135ca2b41 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -152,6 +152,7 @@ ENDIF() SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") @@ -210,6 +211,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() ENDIF() IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 89e23b019b..a437f6132a 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -74,6 +73,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke # This option will go away eventually, but allows fallback to old implementation when needed. KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index d4eca651d4..ae14a10d53 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -7,6 +7,7 @@ KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INST SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) SET(KOKKOS_ENABLE_CXX23 OFF) +SET(KOKKOS_ENABLE_CXX26 OFF) IF (KOKKOS_CXX_STANDARD) MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 7ad49fdd2d..b075a3e36b 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -74,6 +74,10 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") SET(KOKKOS_ENABLE_CXX23 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + SET(KOKKOS_ENABLE_CXX26 ON) ELSE() MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index f124596a84..6ef3b79bde 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -32,19 +32,21 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_HAS_TRILINOS) SET(ROCM_DEFAULT ON) ELSE() SET(ROCM_DEFAULT OFF) ENDIF() +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) + SET(ROCTHRUST_DEFAULT ON) +ELSE() + SET(ROCTHRUST_DEFAULT OFF) +ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) + IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) ELSE() @@ -77,21 +79,18 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() +KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) IF (NOT WIN32) KOKKOS_IMPORT_TPL(THREADS INTERFACE) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() +KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) KOKKOS_IMPORT_TPL(LIBQUADMATH) +KOKKOS_IMPORT_TPL(ROCTHRUST) IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) @@ -119,7 +118,3 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index b30ca70ab9..060a7a8472 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -237,18 +237,10 @@ ENDMACRO() ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") @@ -309,7 +301,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia deleted file mode 100755 index 193a162a4e..0000000000 --- a/lib/kokkos/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. - -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. -# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. -# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-/" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. - GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="/" - ARM_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - BASE_MODULE_LIST_INTEL="/compilers/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" - BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ "$PRINT_HELP" = "True" ]; then - echo "test_all_sandia :" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." - echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. -if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. -get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s//$compiler_name/g" -e "s//$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unreconized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_results -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. - echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell. - module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test -single_build_and_test() { - # Use sane var names. - local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. - local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } - local -i build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. - local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. -# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/lib/kokkos/config/yaml/volta.yaml b/lib/kokkos/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a4..0000000000 --- a/lib/kokkos/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index cd5ca4ea51..f50ab0a0f7 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. - if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template class Bitset; @@ -92,9 +74,10 @@ class Bitset { using block_view_type = View>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) @@ -108,9 +91,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. - const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); @@ -310,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 84bced2cc4..e821570a8d 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -292,15 +292,6 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template - DualView& operator=(const DualView& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... args) diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee..5fa59f1b7c 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.impl_map().layout(); + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View::type, Args...>(v.data(), layout); } template diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 92aead2878..91a7e4a927 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -124,15 +124,8 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( args...); Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index e001c062de..78a6a238ec 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap; - static const bool is_set = std::is_void::value; - static const bool has_const_key = - std::is_same::value; - static const bool has_const_value = - is_set || std::is_same::value; + static constexpr bool is_set = std::is_void_v; + static constexpr bool has_const_key = + std::is_same_v; + static constexpr bool has_const_value = + is_set || std::is_same_v; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; @@ -337,27 +337,27 @@ class UnorderedMap { Impl::get_property(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b..a979ee40d8 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,18 @@ namespace Kokkos { namespace Impl { +//! Append to the label contained in view_ctor_prop. +template +auto append_to_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp; + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; +} + uint32_t find_hash_size(uint32_t size); template diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 2e35832cc8..18410882bc 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp index 3ad0d2bf57..9923453f72 100644 --- a/lib/kokkos/containers/unit_tests/TestBitset.hpp +++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include #include +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } TEST(TEST_CATEGORY, bitset) { test_bitset(); } + +TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index 7f3916da31..e0dba03e1e 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -50,8 +50,8 @@ ELSE() FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index 012af0a7d0..b84677e61b 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -18,10 +18,16 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp @@ -80,10 +86,6 @@ IF (KOKKOS_ENABLE_HPX) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) ENDIF() -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - IF (KOKKOS_ENABLE_SERIAL) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) @@ -180,20 +182,15 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread IF (NOT WIN32) KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b..276d03da26 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -46,7 +46,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos @@ -129,33 +128,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. - KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); +#endif /// \brief Wait until all dispatched functors complete. /// @@ -199,18 +181,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. - static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. */ - static std::vector detect_device_arch(); + KOKKOS_DEPRECATED static std::vector detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44da..0944937e1b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -33,7 +33,6 @@ //#include #include -#include #include @@ -83,11 +82,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -135,11 +134,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,52 +172,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } - } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); - } -#else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + } else #endif + { error_code = cudaMalloc(&ptr, arg_alloc_size); } if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -226,7 +223,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +231,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -256,28 +254,27 @@ void *CudaUVMSpace::impl_allocate( if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: CudaMallocManaged); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,13 +299,14 @@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -350,18 +348,17 @@ void CudaSpace::impl_deallocate( if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence after async free"); } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif } catch (...) { } @@ -393,8 +390,8 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } @@ -424,8 +421,8 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) { } } @@ -438,160 +435,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -//============================================================================== -// {{{1 - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// end SharedAllocationRecord constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -620,19 +463,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3..0e20193e8b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,6 +68,11 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: CudaSpace(CudaSpace&& rhs) = default; CudaSpace(const CudaSpace& rhs) = default; CudaSpace& operator=(CudaSpace&& rhs) = default; @@ -89,9 +94,11 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -112,10 +119,10 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -149,6 +156,11 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: CudaUVMSpace(CudaUVMSpace&& rhs) = default; CudaUVMSpace(const CudaUVMSpace& rhs) = default; CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; @@ -156,6 +168,16 @@ class CudaUVMSpace { ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -167,8 +189,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -189,8 +209,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -223,6 +248,11 @@ class CudaHostPinnedSpace { /*--------------------------------*/ CudaHostPinnedSpace(); + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; @@ -230,6 +260,16 @@ class CudaHostPinnedSpace { ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -240,9 +280,11 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -258,6 +300,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ @@ -280,15 +325,12 @@ const std::unique_ptr& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- @@ -516,179 +558,10 @@ struct DeepCopy -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecord; - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = - HostInaccessibleSharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = SharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f780..c4458c910c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -27,10 +27,6 @@ namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index a4d064e544..5a821ab64a 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include -#include // GraphAccess needs to be complete -#include // SharedAllocationRecord +#include // GraphAccess needs to be complete #include #include @@ -50,10 +49,6 @@ class GraphNodeKernelImpl m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; @@ -82,9 +77,7 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr) return m_driver_storage; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d991..849e8b3b30 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -#include -#include -#include -#include +//#include +//#include +//#include +//#include #include #include #include @@ -97,21 +97,21 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; } @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -168,25 +167,11 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +181,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -208,96 +191,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_count_wrapper( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +200,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,22 +208,23 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + for (int i : get_visible_devices()) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." << prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -370,45 +262,53 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + + m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); + //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + auto const maxWarpCount = std::min( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - - m_stream = stream; - m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -427,22 +327,23 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); + + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } + m_scratchFlagsCount = scratch_count(size); - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); - std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } @@ -453,21 +354,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); + + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } + m_scratchSpaceCount = scratch_count(size); - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); - std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -476,23 +375,20 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } + m_scratchUnifiedCount = scratch_count(size); - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); - std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), - "Kokkos::InternalScratchUnified", alloc_size); - - Record::increment(r); - - m_scratchUnified = reinterpret_cast(r->data()); + m_scratchUnified = static_cast( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -500,21 +396,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); + + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } + m_scratchFunctorSize = size; - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); - - Record *const r = - Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - - Record::increment(r); - - m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctor = static_cast(mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -537,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -568,50 +459,33 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -624,30 +498,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -670,10 +520,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -687,25 +533,23 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); + const std::vector &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; - - Impl::CudaInternal::m_cudaDev = cuda_device_id; + cudaDeviceProp cudaProp; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; @@ -761,78 +605,42 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif - //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; - } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; - //---------------------------------- cudaStream_t singleton_stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); + + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME + + Impl::CudaInternal::singleton().initialize(singleton_stream); +} + +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + desul::Impl::finalize_lock_arrays(); // FIXME + + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); + } + + auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); + Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); } -std::vector Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); - - std::vector output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } - - return output; -} - -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; - - int dev_arch = 0; - - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; - } - - return dev_arch; -} - -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } - Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), [](Impl::CudaInternal *) {}) { @@ -845,13 +653,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfe..24f4af3101 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,6 +22,10 @@ #include #include #include +#include "Kokkos_CudaSpace.hpp" + +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,27 +59,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type cuda_internal_maximum_warp_count(); -std::array cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -101,18 +88,10 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + inline static int m_cudaArch = -1; static int concurrency(); inline static cudaDeviceProp m_deviceProp; @@ -129,7 +108,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -142,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -156,7 +134,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; @@ -247,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); @@ -290,37 +262,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - - template - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template - const char* cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - - template - cudaError_t cuda_get_last_error_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, @@ -506,10 +447,10 @@ class CudaInternal { } template - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 82a72b6902..b0dadb45f7 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include #include #include #include @@ -118,42 +117,43 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } // These functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. -// FIXME_CUDA_MULTIPLE_DEVICES template const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes(func); + get_cuda_kernel_func_attributes(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + (CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( @@ -665,8 +668,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -675,18 +678,17 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e5..2c7eba7a18 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 49d6c112e3..6303898400 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -42,8 +41,8 @@ namespace Impl { template int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -96,7 +95,7 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); KOKKOS_ASSERT(block.x > 0); @@ -325,19 +324,18 @@ class ParallelReduce( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 3472999281..0f052be3c3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -86,18 +85,18 @@ class ParallelFor, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -244,10 +243,10 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } else if (word_count.value > 1) { - // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below - // might have been updated by a single thread within a warp without - // synchronization afterwards. Synchronize threads within warp to avoid - // potential racecondition. + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. __syncwarp(0xffffffff); } @@ -260,19 +259,18 @@ class ParallelReduce, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( @@ -615,11 +613,11 @@ class ParallelScan, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; @@ -939,11 +937,11 @@ class ParallelScanWithTotal, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index b4679b4e0d..9f7be45c83 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -98,7 +98,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size( @@ -137,7 +137,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( @@ -262,7 +262,8 @@ class TeamPolicyInternal m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -369,7 +370,7 @@ class TeamPolicyInternal cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -539,8 +540,8 @@ class ParallelFor, auto internal_space_instance = m_policy.space().impl_internal_space_instance(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -575,10 +576,11 @@ class ParallelFor, static_cast(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } @@ -623,6 +625,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -646,9 +664,11 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Iterate this block through the league @@ -721,18 +742,19 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -787,7 +809,8 @@ class ParallelReduce(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -808,13 +831,15 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -847,7 +872,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } @@ -883,9 +909,8 @@ class ParallelReduce::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -940,6 +965,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28..3037c4ab54 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template __device__ bool cuda_inter_block_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); - size_t maxShmemPerBlock = - policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f5..86d6d91bbe 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -84,8 +84,8 @@ class TaskQueueSpecialization> { KOKKOS_INLINE_FUNCTION static void iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +225,11 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + exec.cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -245,34 +249,30 @@ class TaskQueueSpecialization> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -300,8 +300,8 @@ class TaskQueueSpecialization> { set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); @@ -466,7 +466,13 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + // FIXME not sure why this didn't work + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -482,34 +488,30 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -532,8 +534,7 @@ class TaskQueueSpecializationConstrained< set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a..94a428493f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc..c7ea6988a5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d91..517c592af7 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -25,23 +25,14 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, + const View& dst) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() ->cuda_memset_async_wrapper( dst.data(), 0, dst.size() * sizeof(typename View::value_type)))); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); - } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2..309e07fb3f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include #include #include @@ -41,7 +42,9 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings); + const std::vector& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( @@ -89,10 +92,23 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), @@ -102,13 +118,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp index 61ed346b21..3a88e97ee3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp @@ -57,13 +57,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. * @@ -94,9 +96,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426b..5f0df72df1 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include -#include #include namespace Kokkos { @@ -43,7 +41,6 @@ class GraphNodeKernelImpl using base_t = typename PatternImplSpecializationFromTag::type; - using Record = Kokkos::Impl::SharedAllocationRecord; // TODO use the name and executionspace template @@ -60,7 +57,7 @@ class GraphNodeKernelImpl ~GraphNodeKernelImpl() { if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); + Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); } } @@ -78,15 +75,9 @@ class GraphNodeKernelImpl Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721c..22c0db047f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include /*--------------------------------------------------------------------------*/ @@ -89,10 +90,14 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << "undefined\n"; +#endif - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? "APU" : "dGPU"; @@ -159,14 +164,13 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -192,20 +196,19 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { + Kokkos::HIPSpace mem_space; + + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } + m_scratchSpaceCount = scratch_count(size); - using Record = Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); - std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -214,21 +217,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { + Kokkos::HIPSpace mem_space; + + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } + m_scratchFlagsCount = scratch_count(size); - using Record = Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); - std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -238,29 +243,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); + m_scratchFunctorSize = size; - Record::increment(r); - RecordHost::increment(r_host); - - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); + m_scratchFunctor = static_cast(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,23 +319,18 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - (void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchSpaceCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchFlagsCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } @@ -348,14 +339,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -419,13 +406,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - -//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} -} // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 63ad66686b..142008124a 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -98,7 +98,6 @@ class HIPInternal { uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -124,7 +123,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 0000000000..db07c360b5 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template +class ParallelFor, HIP> { + public: + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = hip_internal_maximum_grid_count(); + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor, HIP>; + unsigned block_size = hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 0000000000..9355c1c75f --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(i); + } + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? iwork + work_stride + : work_end) { + this->template exec_range(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 0000000000..bf0c219338 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, HIP> { + public: + using Policy = TeamPolicy; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team(typename Policy::member_type( + kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, + static_cast(static_cast(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast(m_league_size), 1, 1); + dim3 const block(static_cast(m_vector_size), + static_cast(m_team_size), 1); + + using closure_type = + ParallelFor, HIP>; + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 0fa325cb12..55b6218d1c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include #include #include #include #include #include -#include namespace Kokkos { namespace Impl { -// ParallelFor -template -class ParallelFor, HIP> { - public: - using Policy = Kokkos::MDRangePolicy; - using functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template - static int max_tile_size_product(const Policy&, const Functor&) { - using closure_type = - ParallelFor, HIP>; - unsigned block_size = hip_get_max_blocksize(); - if (block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp new file mode 100644 index 0000000000..c8981866e8 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp @@ -0,0 +1,329 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + public: + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; + using functor_type = FunctorType; + using reducer_type = ReducerType; + using size_type = Kokkos::HIP::size_type; + using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + + static constexpr bool UseShflReduction = false; + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(i, update); + } + + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = std::conditional_t; + run(ReductionTag{}); + } + + __device__ inline void run(SHMEMReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); + + { + reference_type value = reducer.init(reinterpret_cast( + ::Kokkos::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + ::Kokkos::kokkos_impl_hip_shared_memory(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + ::Kokkos::kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + + pointer_type const result = reinterpret_cast(m_scratch_space); + + int max_active_thread = static_cast(range.end() - range.begin()) < + static_cast(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + reducer.init(&init); + if (m_policy.begin() == m_policy.end()) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<>( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + return Kokkos::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + !std::is_same::value; + if ((nwork > 0) || need_device_set) { + const int block_size = local_block_size(m_functor_reducer.get_functor()); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + int nblocks = (nwork + block.y - 1) / block.y; + // Heuristic deciding the value of nblocks. + // The general idea here is we want to: + // 1. Not undersubscribe the device (i.e., we want at least + // preferred_block_min blocks) + // 2. Have each thread reduce > 1 value to minimize overheads + // 3. Limit the total # of blocks, to avoid unbounded scratch space + constexpr int block_max = 4096; + constexpr int preferred_block_min = 1024; + + if (nblocks < preferred_block_min) { + // keep blocks as is, already have low parallelism + } else if (nblocks > block_max) { + // "large dispatch" -> already have lots of parallelism + nblocks = block_max; + } else { + // in the intermediate range, try to have each thread process multiple + // items to offset the cost of the reduction (with not enough + // parallelism to hide it) + int items_per_thread = + (nwork + nblocks * block_size - 1) / (nblocks * block_size); + if (items_per_thread < 4) { + int ratio = std::min( + (nblocks + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + nblocks /= ratio; + } + } + + // TODO: down casting these uses more space than required? + m_scratch_space = + (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + // Intentionally do not downcast to word_size_type since we use HIP + // atomics in Kokkos_HIP_ReduceScan.hpp + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); + // Required grid.x <= block.y + dim3 grid(nblocks, 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + m_functor_reducer.get_functor(), block.y); + + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible && m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp new file mode 100644 index 0000000000..609ba28b86 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -0,0 +1,394 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, HIP> { + public: + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. + static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team( + member_type( + kokkos_impl_hip_shared_memory() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast( + reinterpret_cast(m_scratch_ptr[1]) + + static_cast(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(size_type)); + + reference_type value = + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. + bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = + hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? m_result_ptr + : reinterpret_cast(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + + m_team_begin = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 26e8be4698..41692a3291 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include -#if defined(__HIPCC__) - #include #include #include -#include -#include namespace Kokkos { namespace Impl { -template -class ParallelFor, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - - private: - using Member = typename Policy::member_type; - using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(i); - } - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? iwork + work_stride - : work_end) { - this->template exec_range(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. - using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast( - ::Kokkos::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. - - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - - pointer_type const result = reinterpret_cast(m_scratch_space); - - int max_active_thread = static_cast(range.end() - range.begin()) < - static_cast(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} -}; - template class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal, } // namespace Kokkos #endif - -#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 3fe568ac36..0000000000 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP - -#include - -#if defined(__HIPCC__) - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - template - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template - TeamPolicyInternal(TeamPolicyInternal const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common(f); - } - - template - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - template - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common( - f); - } - - template - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. They also use one int64_t in static shared memory for a - // shared ID. Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = Kokkos::Impl::HIPTeamMember; - - protected: - template - int internal_team_size_common(FunctorType const& f) const { - const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization::type; - if constexpr (std::is_same_v) { - using Interface = - typename Impl::DeduceFunctorPatternInterface::type; - using Analysis = - Impl::FunctorAnalysis; - shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. - int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template -class ParallelFor, HIP> { - public: - using Policy = TeamPolicy; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team(typename Policy::member_type( - kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, - static_cast(static_cast(m_scratch_ptr[1]) + - ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast(m_league_size), 1, 1); - dim3 const block(static_cast(m_vector_size), - static_cast(m_team_size), 1); - - using closure_type = - ParallelFor, HIP>; - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, HIP> { - public: - using Policy = TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. - static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - kokkos_impl_hip_shared_memory() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. - bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? m_result_ptr - : reinterpret_cast(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ea599989e7..ab24004f5f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,14 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include #include +#include +#include +#include -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index e68bad9723..fbae518834 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,11 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include +#include -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - using base_t = HostInaccessibleSharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPManagedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); #endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 7f6aa0d8e8..e8bdfca66f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -24,10 +24,8 @@ #include #include -#include #include -#include #include #include @@ -287,22 +285,3 @@ void HIPManagedSpace::impl_deallocate( } } // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include - -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e..7f2004e5cb 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,15 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -76,8 +85,6 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -94,8 +101,6 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -129,6 +134,16 @@ class HIPHostPinnedSpace { ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -140,8 +155,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -194,6 +207,16 @@ class HIPManagedSpace { ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -209,8 +232,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -239,8 +260,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 0000000000..67e1181125 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,421 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class TeamPolicyInternal + : public PolicyTraits { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits; + + template + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template + TeamPolicyInternal(TeamPolicyInternal const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common(f); + } + + template + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + template + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common( + f); + } + + template + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag const&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + // reductions. They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) + : m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type = Kokkos::Impl::HIPTeamMember; + + protected: + template + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization::type; + if constexpr (std::is_same_v) { + using Interface = + typename Impl::DeduceFunctorPatternInterface::type; + using Analysis = + Impl::FunctorAnalysis; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. + int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f5217..3d70b59646 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include #include -#include namespace Kokkos { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8..4bca29868f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -25,19 +25,11 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HIP& exec_space, const View& dst) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4..6d541a6414 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b4064..26181a7c05 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -27,14 +27,6 @@ static_assert(false, #include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -59,6 +51,7 @@ static_assert(false, #include +#include #include #include #include @@ -201,6 +194,7 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { @@ -223,9 +217,10 @@ class HPX { delete; }; - static bool in_parallel(HPX const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -248,18 +243,6 @@ class HPX { #endif } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); #else @@ -355,7 +338,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -417,15 +402,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -1292,6 +1283,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1299,6 +1291,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1320,6 +1315,7 @@ class ParallelScan, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1327,6 +1323,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1407,6 +1406,7 @@ class ParallelScanWithTotal, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1414,6 +1414,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1435,6 +1438,7 @@ class ParallelScanWithTotal, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1442,6 +1446,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01c..297b1fadee 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -73,7 +73,7 @@ is_less_than_value_initialized_variable(T arg) { // Checked narrowing conversion that calls abort if the cast changes the value template -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = (std::is_signed::value != std::is_signed::value); auto const ret = static_cast(arg); @@ -81,7 +81,12 @@ constexpr To checked_narrow_cast(From arg) { (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + Kokkos::abort(msg.c_str()); } return ret; } @@ -96,15 +101,15 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast(x); }); // except that std::transform is not constexpr. - for (auto x : init) { - *ptr++ = checked_narrow_cast(x); - (void)checked_narrow_cast(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast(init[i], i); + (void)checked_narrow_cast(init[i], i); // see note above } return a; } @@ -120,10 +125,10 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast(other[i]); - (void)checked_narrow_cast(other[i]); // see note above + a[i] = checked_narrow_cast(other[i], i); + (void)checked_narrow_cast(other[i], i); // see note above } return a; } @@ -150,9 +155,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses in all places for correctness. +template +struct MDRangePolicy + : public Kokkos::Impl::PolicyTraits { + using traits = Kokkos::Impl::PolicyTraits; + using range_policy = RangePolicy; typename traits::execution_space m_space; @@ -161,8 +177,8 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy; // needed for is_execution_space - // interrogation + MDRangePolicy; // needed for is_execution_policy + // interrogation template friend struct MDRangePolicy; @@ -327,6 +343,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 1)) || @@ -358,6 +388,60 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } }; +template +MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; + +template +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&, Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&, + Array const&) + ->MDRangePolicy>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index 82ceaaec21..ba1626bb72 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -320,6 +321,9 @@ struct Array::strided> { : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +template +Array(T, Us...)->Array; + } // namespace Kokkos // diff --git a/lib/kokkos/core/src/Kokkos_Assert.hpp b/lib/kokkos/core/src/Kokkos_Assert.hpp index c3b9004734..6fea286005 100644 --- a/lib/kokkos/core/src/Kokkos_Assert.hpp +++ b/lib/kokkos/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) \ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) #ifndef KOKKOS_ASSERT diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 1c43474632..9acacef901 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -25,7 +25,7 @@ static_assert(false, #include #include -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index bda3783980..eebdd20f15 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -49,7 +49,7 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/lib/kokkos/core/src/Kokkos_Clamp.hpp b/lib/kokkos/core/src/Kokkos_Clamp.hpp new file mode 100644 index 0000000000..033cde9ab8 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include + +namespace Kokkos { + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index a0ca55be70..08f6ba8d69 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include +#include #include #include #include @@ -612,12 +613,17 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it @@ -1330,13 +1336,12 @@ inline void contiguous_fill( // Default implementation for execution spaces that don't provide a definition template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { + using ValueType = typename ViewType::value_type; + alignas(alignof(ValueType)) unsigned char + zero_initialized_storage[sizeof(ValueType)] = {}; + contiguous_fill(exec_space, dst, + *reinterpret_cast(zero_initialized_storage)); } }; @@ -1348,13 +1353,18 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) - ZeroMemset>(exec_space, dst, value); - else + // With OpenMP, using memset has significant performance issues. + if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v #endif + ) + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset>(exec_space, dst); + else contiguous_fill(exec_space, dst, value); } @@ -1379,15 +1389,20 @@ contiguous_fill_or_memset( typename ViewTraits::const_value_type& value) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset>(dst, value); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset(exec, dst); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 805411a699..1f146563be 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -46,14 +46,15 @@ #include #include -#include #include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; @@ -300,9 +302,6 @@ std::vector partition_space(ExecSpace const& space, // implementation of the RAII wrapper is using Kokkos::single. #include -// Specializations required after core definitions -#include - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f..7edb35f00e 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include #include -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. @@ -75,9 +71,6 @@ template struct Device; // forward declare here so that backend initializer calls can use it. -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos @@ -262,12 +255,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index ae1585a498..5f251eeb26 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- @@ -114,62 +115,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask(0) {} /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} + + /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); set_auto_chunk_size(); } - /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } - - /** \brief Total range */ - template - inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, - Args... args) + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { - set_auto_chunk_size(); - set(args...); + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ - template - inline RangePolicy(const member_type work_begin, const member_type work_end, - Args... args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - set(args...); - } - - private: - inline void set() {} + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + chunk_size) {} public: - template - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template - inline void set(const ChunkSize& chunksize, Args... args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ @@ -218,6 +224,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } + void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } + } + + // To be replaced with std::in_range (c++20) + template + static void check_conversion_safety(const IndexType bound) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_signed_v != + std::is_signed_v) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } + + // check narrowing + warn |= (static_cast(static_cast(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#else + (void)bound; +#endif + } + public: /** \brief Subrange for a partition's rank and size. * @@ -261,6 +328,21 @@ class RangePolicy : public Impl::PolicyTraits { }; }; +RangePolicy()->RangePolicy<>; + +RangePolicy(int64_t, int64_t)->RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + ->RangePolicy<>; + +template >> +RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; + +template >> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -983,7 +1065,16 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum{val}); } template @@ -997,7 +1088,29 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); } template @@ -1011,7 +1124,31 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); + policy.team.team_reduce( + Kokkos::Sum{val}); } template diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 369b7bafb7..0000000000 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,308 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! This memory space preferred device_type - using device_type = Kokkos::Device; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - template - friend class LogicalMemorySpace; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecord { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, - const Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -static_assert( - Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 252aabd949..a1fb0f5a67 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ static_assert(false, #include #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include /*--------------------------------------------------------------------------*/ @@ -94,6 +93,16 @@ class HostSpace { #endif /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -105,9 +114,6 @@ class HostSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -124,7 +130,6 @@ class HostSpace { private: static constexpr const char* m_name = "Host"; - friend class Kokkos::Impl::SharedAllocationRecord; }; } // namespace Kokkos @@ -136,8 +141,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { @@ -173,75 +177,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/lib/kokkos/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81f..0000000000 --- a/lib/kokkos/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - - using execution_space = - std::conditional_t::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device; - - LogicalMemorySpace() = default; - - template - LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - } -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template -class SharedAllocationRecord, - void> : public SharedAllocationRecord { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast(arg_rec); - } - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - ~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? static_cast(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template -SharedAllocationRecord - SharedAllocationRecord, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct DeepCopy, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 3cf7ac4fa2..b255d2a519 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -84,11 +84,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA @@ -339,12 +340,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -433,22 +428,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST -#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present diff --git a/lib/kokkos/core/src/Kokkos_MasterLock.hpp b/lib/kokkos/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371..0000000000 --- a/lib/kokkos/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. std::unique_lock, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/lib/kokkos/core/src/Kokkos_MathematicalConstants.hpp b/lib/kokkos/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347d..1a77f373fd 100644 --- a/lib/kokkos/core/src/Kokkos_MathematicalConstants.hpp +++ b/lib/kokkos/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS diff --git a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93..3fead8dd29 100644 --- a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -151,9 +135,7 @@ using promote_3_t = typename promote_3::type; KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> FUNC( \ T x) { \ return ::FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -173,9 +155,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -218,16 +198,10 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -314,8 +288,6 @@ inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +308,6 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using ::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +444,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE diff --git a/lib/kokkos/core/src/Kokkos_MinMaxClamp.hpp b/lib/kokkos/core/src/Kokkos_MinMax.hpp similarity index 83% rename from lib/kokkos/core/src/Kokkos_MinMaxClamp.hpp rename to lib/kokkos/core/src/Kokkos_MinMax.hpp index 37a28a80b6..5c60a88bfb 100644 --- a/lib/kokkos/core/src/Kokkos_MinMaxClamp.hpp +++ b/lib/kokkos/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include #include @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? hi : value; -} - // max template constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { @@ -199,15 +178,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index 7127c78280..9be8d8d7aa 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include +#include #include namespace Kokkos { @@ -484,7 +485,6 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( } namespace Impl { - template struct is_pair_like : std::false_type {}; template diff --git a/lib/kokkos/core/src/Kokkos_Printf.hpp b/lib/kokkos/core/src/Kokkos_Printf.hpp index 39f95825c3..63a4cce2ae 100644 --- a/lib/kokkos/core/src/Kokkos_Printf.hpp +++ b/lib/kokkos/core/src/Kokkos_Printf.hpp @@ -30,8 +30,11 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) @@ -39,15 +42,13 @@ KOKKOS_FUNCTION void printf(const char* format, Args... args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b0..e7a9ba0c7e 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,49 +22,34 @@ #endif #include -#include #include #include -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { + +class [[nodiscard]] ProfilingSection { + uint32_t sectionID; -class ProfilingSection { public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { + Kokkos::Profiling::createProfileSection(sectionName, §ionID); } - void start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(sectionID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::stopSection(secID); - } - } + void stop() { Kokkos::Profiling::stopSection(sectionID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::destroyProfileSection(secID); - } - } - - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/lib/kokkos/core/src/Kokkos_Swap.hpp b/lib/kokkos/core/src/Kokkos_Swap.hpp new file mode 100644 index 0000000000..2f849a13ab --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include + +#include +#include +#include + +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr std::enable_if_t && + std::is_move_assignable_v> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& + std::is_nothrow_move_assignable_v) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int); + struct Nope; + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; +}; + +template +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval(), std::declval())); + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr std::enable_if_t::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp index 618401654e..f5ffc66af5 100644 --- a/lib/kokkos/core/src/Kokkos_Tuners.hpp +++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index bcbb28014c..484a0e6f62 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -39,7 +39,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include #endif -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -75,25 +75,59 @@ constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); } -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { (void)(label); if (is_void_spec) { const size_t num_passed_args = count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; - if (num_passed_args != dyn_rank && num_passed_args != rank) { + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { KOKKOS_IF_ON_HOST( const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. Number of arguments = " + + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; Kokkos::abort(message.c_str());) KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " "mismatched number of arguments.");) @@ -814,15 +848,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: @@ -1402,21 +1436,30 @@ class View : public ViewTraits { "execution space"); } - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); @@ -1445,6 +1488,29 @@ class View : public ViewTraits { typename Impl::ViewCtorProp::pointer_type>::value, "Constructing View to wrap user memory must supply matching pointer " "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif } // Simple dimension-only layout diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f..99daf379b6 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -58,8 +58,10 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a..5155bee33d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -91,7 +91,11 @@ class OpenACC { #else int concurrency() const { return 256000; } // FIXME_OPENACC #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool in_parallel() { + return acc_on_device(acc_device_not_host); + } +#endif uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1..acc0dcd3c6 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include @@ -66,6 +66,19 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + size_t alignment = 1; // OpenACC does not handle alignment + using Kokkos::Experimental::RawMemoryAllocationFailure; + auto failure_mode = + arg_alloc_size > 0 + ? RawMemoryAllocationFailure::FailureMode::OutOfMemoryError + : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; + auto alloc_mechanism = + RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; + throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, + alloc_mechanism); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 4aed7e00f7..ca022192b0 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba5..c3d7236872 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -35,7 +35,7 @@ class OpenACCInternal { public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +43,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc11..5afb5e75d3 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,7 @@ template ::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce> struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -129,7 +129,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -394,6 +394,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( vector_length); \ functor(team, val); \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f73..76e1514476 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include +#include #include -#include -#include - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - -Kokkos::Impl::SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC &arg_exec_space, - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -//============================================================================== -// {{{1 +#include #include -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicitly instantiate these CRTP base classes -// here, where we have access to the associated *_timpl.hpp header files. -template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; -template class Kokkos::Impl::SharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenACCSpace); diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp index cf83a5b27b..cde5ecdcb7 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp @@ -20,55 +20,7 @@ #include #include -#include - -template <> -class Kokkos::Impl::SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - friend class SharedAllocationRecordCommon; - friend Kokkos::Experimental::OpenACCSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenACCSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenACCSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC& exec_space, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); -}; +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenACCSpace); #endif diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index 4ec71f56ef..20ea392452 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true::value, + Kokkos::Impl::always_false::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c..81f2c5c305 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -81,29 +81,16 @@ bool OpenMP::impl_is_initialized() noexcept { return Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } +#endif int OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d524..11292af84a 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,14 +27,7 @@ static_assert(false, #include -#include -#include #include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -45,6 +38,8 @@ static_assert(false, #include +#include +#include #include /*--------------------------------------------------------------------------*/ @@ -53,11 +48,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -67,12 +57,7 @@ class OpenMP { //! Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! This execution space preferred device_type using device_type = Kokkos::Device; @@ -87,8 +72,10 @@ class OpenMP { /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// @@ -104,18 +91,6 @@ class OpenMP { /// This always returns false on OpenMP inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); #else @@ -166,14 +141,7 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 12bf3b71f7..32172fbc6c 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -47,61 +47,6 @@ void OpenMPInternal::release_lock() { desul::MemoryScopeDevice()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 03f5fff395..35b9aa93ba 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,16 +41,6 @@ #include /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { @@ -99,11 +89,6 @@ class OpenMPInternal { // Release lock used to protect access to m_pool void release_lock(); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -115,6 +100,8 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; @@ -122,32 +109,20 @@ class OpenMPInternal { void print_configuration(std::ostream& s) const; }; -} // namespace Impl - -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast(omp_test_lock(&m_lock)); } - - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } - - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t m_lock; -}; +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. +#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast(omp_get_nested()); #endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} -} // namespace Experimental +} // namespace Impl namespace Experimental { namespace Impl { @@ -202,50 +177,6 @@ std::vector partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb7..823a7e668e 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -147,15 +147,7 @@ class ParallelFor, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -251,16 +243,9 @@ class ParallelFor, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -409,15 +394,7 @@ class ParallelFor, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e6..05fd1c9dce 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -170,15 +170,7 @@ class ParallelReduce, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, @@ -319,15 +311,7 @@ class ParallelReduce::accessible, @@ -543,15 +527,7 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -292,15 +284,7 @@ class ParallelScanWithTotal, Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::OpenMP parallel_scan result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd08..ea4e7f6bab 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -65,7 +65,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de0..a414b34d7c 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -110,79 +109,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982..ed625cfcc8 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -98,6 +98,16 @@ class OpenMPTargetSpace { ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -121,9 +131,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,64 +138,8 @@ class OpenMPTargetSpace { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenMPTargetSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409..b39f5aca35 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -98,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, int64_t shmem_size_L1, int64_t league_size) { Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f2..3387108da3 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -178,8 +178,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 0000000000..2bd672f4d0 --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP +#define KOKKOS_OPENMPTARGET_MACROS_HPP + +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. +#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + +// Define a macro for llvm compiler greater than version 17 and on NVIDIA and +// AMD GPUs. This would be useful in cases where non-OpenMP standard llvm +// extensions can be used. +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \ + (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)) +#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#endif + +#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x) +#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \ + KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x) + +// Use scratch memory extensions to request dynamic shared memory for the +// right compiler/architecture combination. +#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N) +#else +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) +#endif + +#endif // KOKKOS_OPENMPTARGET_MACROS_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 9767d8e53e..dcc509d2fa 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -21,16 +21,10 @@ #include #include #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" - -// Intel architectures prefer the classical hierarchical parallelism that relies -// on OpenMP. -#if defined(KOKKOS_ARCH_INTEL_GPU) -#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU -#endif +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -113,14 +107,20 @@ class OpenMPTargetExecTeamMember { team_broadcast(value, thread_id); } - // FIXME_OPENMPTARGET this function has the wrong interface and currently - // ignores the reducer passed. - template - KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, - const JoinOp&) const { + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + // FIXME_OPENMPTARGET this function currently ignores the reducer passed. + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const&, typename ReducerType::value_type& value) const + noexcept { #pragma omp barrier - using value_type = ValueType; + using value_type = typename ReducerType::value_type; // const JoinLambdaAdapter op(op_in); // Make sure there is enough scratch space: @@ -149,8 +149,9 @@ class OpenMPTargetExecTeamMember { } #pragma omp barrier } - return team_scratch[0]; + value = team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. * @@ -249,15 +250,37 @@ class OpenMPTargetExecTeamMember { // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for // hierarchical reduction. There is an additional 10% of the requested // scratch memory allocated per team as padding. Hence the product with 0.1. + // + // Use llvm extensions for dynamic shared memory with compilers/architecture + // combinations where it is supported. + // + // Size allocated in HBM will now change based on whether we use llvm + // extensions. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1; +#else + const int total_shmem = + shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1; +#endif + + // Per team offset for buffer in HBM. const int reduce_offset = - m_shmem_block_index * - (shmem_size_L0 + shmem_size_L1 + - ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE); + +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE; + char* l0_scratch = + static_cast(llvm_omp_target_dynamic_shared_alloc()); + m_team_shared = scratch_memory_space( + l0_scratch, shmem_size_L0, static_cast(glb_scratch) + l1_offset, + shmem_size_L1); +#else const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; const int l1_offset = l0_offset + shmem_size_L0; m_team_shared = scratch_memory_space( (static_cast(glb_scratch) + l0_offset), shmem_size_L0, static_cast(glb_scratch) + l1_offset, shmem_size_L1); +#endif m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; m_league_rank = league_rank; m_team_rank = omp_tid; @@ -751,6 +774,7 @@ class OpenMPTargetExec { int64_t thread_local_bytes, int64_t league_size); static void* m_scratch_ptr; + static std::mutex m_mutex_scratch_ptr; static int64_t m_scratch_size; static int* m_lock_array; static uint64_t m_lock_size; diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 1abc925cae..26085f1140 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -140,8 +141,10 @@ class ParallelFor, // guarantees that the number of teams specified in the `num_teams` clause is // always less than or equal to the maximum concurrently running teams. #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ - num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams thread_limit(team_size) firstprivate(a_functor) + num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel { if (omp_get_num_teams() > max_active_teams) diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4452af3846..caa568a892 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,6 +55,9 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa7151..8abffa47a4 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df..c1f7851f41 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -238,8 +238,10 @@ class ParallelScanWithTotal, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f27..eb3dc3773c 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize, initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6..6878531730 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline std::enable_if_t execute_tile(const FunctorType& functor, diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b..9b578aca11 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693..9a246f7642 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,26 +88,57 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif - +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif - if (verbose) + int counter = 0; + int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector devices = Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } + + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { @@ -137,20 +168,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use whatever is available (if no GPU architecture is - // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif - const auto id = ::Kokkos::Impl::get_gpu(settings); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); + std::vector sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -243,9 +265,32 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector get_sycl_devices() { +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) + std::vector devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + devices.end()); +#else + std::vector devices = sycl::device::get_devices(); +#endif + return devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); -} +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b8930..0f3d1f0994 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -78,19 +78,15 @@ class SYCL { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); @@ -188,6 +184,10 @@ std::vector partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + +namespace Impl { +std::vector get_sycl_devices(); +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d..0e67adb578 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -54,7 +54,7 @@ Kokkos::View sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " "Kokkos::Experimental::SYCL::finalize()" << std::endl; @@ -102,6 +102,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); @@ -196,14 +213,22 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + if (nullptr != m_scratchHost) + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; @@ -228,54 +253,68 @@ void SYCLInternal::finalize() { sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; } +sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); + + m_scratchHostCount = scratch_count(size); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + m_scratchHost = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + } + + return m_scratchHost; +} + sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); + m_scratchFlags = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } @@ -318,15 +357,12 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = + alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -340,8 +376,8 @@ void SYCLInternal::USMObjectMem::reset() { if (m_data) { // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. - using Record = Kokkos::Impl::SharedAllocationRecord; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d..ab7e8ce71e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,6 +45,7 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr scratch_flags(const std::size_t size); + sycl::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, @@ -60,6 +61,8 @@ class SYCLInternal { std::size_t m_scratchSpaceCount = 0; sycl::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::host_ptr m_scratchHost = nullptr; std::size_t m_scratchFlagsCount = 0; sycl::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory @@ -330,8 +333,8 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. +#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -356,3 +359,4 @@ struct sycl::is_device_copyable< : std::true_type {}; #endif #endif +#endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b..7fbf5420f8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209..b4de7eb89f 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9..ecb4a863da 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor, int m_shmem_size; sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_lock; + // Only let one ParallelFor instance at a time use the team scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues @@ -74,7 +76,8 @@ class Kokkos::Impl::ParallelFor, auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); @@ -141,9 +144,9 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf..f55280e22e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -95,9 +95,16 @@ class Kokkos::Impl::ParallelReduce results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If n_tiles==0 we only call init() and final() working with the global // scratch memory but don't copy back to m_result_ptr yet. if (n_tiles == 0) { @@ -109,8 +116,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -148,8 +157,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); @@ -223,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -268,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { @@ -296,11 +309,13 @@ class Kokkos::Impl::ParallelReduce( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + m_space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -335,9 +350,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa42..5333e3c8a8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -70,11 +70,20 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -168,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -210,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { @@ -320,11 +331,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -354,9 +367,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb9..27165c59e3 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce m_scratch_lock; + // Only let one ParallelReduce instance at a time use the team scratch memory + // and the host scratch memory. The constructor acquires the mutex which is + // released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -79,9 +80,16 @@ class Kokkos::Impl::ParallelReduce>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -89,8 +97,10 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -121,9 +131,10 @@ class Kokkos::Impl::ParallelReduce) functor(team_member, update); else @@ -160,12 +171,16 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -173,8 +188,6 @@ class Kokkos::Impl::ParallelReduce( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -188,8 +201,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -241,8 +255,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -311,10 +326,7 @@ class Kokkos::Impl::ParallelReduce local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); @@ -358,11 +370,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -448,9 +462,9 @@ class Kokkos::Impl::ParallelReducem_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { initialize(); } }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e1..977b69bc9e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include #include @@ -111,13 +111,13 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl::host_ptr m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_shared_memory_lock; + // Only let one ParallelScan instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; private: template @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total); @@ -220,6 +221,8 @@ class ParallelScanSYCLBase { sycl::device_ptr global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); @@ -253,7 +256,8 @@ class ParallelScanSYCLBase { global_mem = static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +285,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? m_result_ptr + : static_cast(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +298,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +316,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -351,9 +353,9 @@ class ParallelScanSYCLBase { m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_scratch_buffers_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexScratchSpace) {} }; } // namespace Kokkos::Impl @@ -390,11 +392,13 @@ class Kokkos::Impl::ParallelScanWithTotal< Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. + m_exec.fence( + "Kokkos::Impl::ParallelReduce::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796..9cc8008cdf 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -243,226 +242,17 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0..b86cfca413 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -66,11 +66,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -87,6 +82,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -102,11 +107,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -123,6 +123,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -138,11 +148,6 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLHostUSM"; }; private: @@ -166,19 +171,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLHostUSMSpace); + #endif #endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195..dbba382758 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -140,9 +140,14 @@ class SYCLTeamMember { } value = sg.shuffle(value, 0); + const auto n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + reducer.reference() = value; + return; + } + // We need to chunk up the whole reduction because we might not have // allocated enough memory. - const auto n_subgroups = sg.get_group_range()[0]; const unsigned int maximum_work_range = std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9..61db6b34aa 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { + const View& dst) { auto event = exec_space.impl_internal_space_instance()->m_queue->memset( dst.data(), 0, dst.size() * sizeof(typename View::value_type)); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -35,12 +34,6 @@ struct ZeroMemset> { ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); #endif } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); - } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp index 071ecdbc4f..39b201976b 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -153,7 +153,7 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index 67119cac16..43eb4992ed 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -121,7 +121,10 @@ class Serial { /// For the Serial device, this method always returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. - inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. /// diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa500..67978aa3e9 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include #include diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716db..91b4c56711 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af22..f34a7daaca 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce..5905d6d32e 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -121,7 +121,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +157,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa..6ad6aabc5a 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,14 +35,11 @@ template struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, - View> - : public ZeroMemset> { - using Base = ZeroMemset>; - using Base::Base; - - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} + View> { + ZeroMemset(const Serial&, const View& dst) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ec..31653c46ca 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp @@ -38,15 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -class ThreadsExec; -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. */ @@ -73,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 56% rename from lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 801a1ac82e..3842966cd7 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,17 +16,15 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include -#include -#include #include #include #include #include -#include #include @@ -41,7 +39,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. // Recovery from an exception would require constant intra-thread health @@ -49,7 +46,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,32 +59,17 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - -void (*volatile s_current_function)(ThreadsExec &, const void *); +void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; - inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); unsigned count = 0; @@ -97,6 +79,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(volatile ThreadState &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos @@ -107,66 +95,44 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { -//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsExec::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } - //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { - while (value == flag) { - std::this_thread::yield(); - } -} +void execute_function_noop(ThreadsInternal &, const void *) {} -void execute_function_noop(ThreadsExec &, const void *) {} - -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - - ThreadsExec *const nil = nullptr; + // The code in the if is executed by a spawned thread not by the root + // thread + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding const int entry = reinterpret_cast(s_current_function_arg) < @@ -178,80 +144,66 @@ ThreadsExec::ThreadsExec() // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on unitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } - -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -264,24 +216,6 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); - } - - exec.m_pool_state = ThreadsExec::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -290,8 +224,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,63 +241,48 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process. return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +#endif +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); - } - - s_current_function = nullptr; - s_current_function_arg = nullptr; - - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. - memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); } + + s_current_function = nullptr; + s_current_function_arg = nullptr; + + // Make sure function and arguments are cleared before + // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -372,68 +291,29 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active, we still need to + // call s_current_function. for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } } //---------------------------------------------------------------------------- -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsExec::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; - } - - return true; -} - -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsExec::global_unlock(); - - if (s_threads_process.m_pool_base) { - execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; - } - - fence(); - - return true; -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -449,18 +329,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -472,27 +352,20 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = - Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); @@ -505,7 +378,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +395,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,27 +410,22 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. - (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -569,25 +437,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." << thfan->m_numa_core_rank << " ]"; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -605,29 +469,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; +void ThreadsInternal::initialize(int thread_count_arg) { + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -640,17 +496,18 @@ void ThreadsExec::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -658,8 +515,8 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + for (unsigned ith = 1; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +532,20 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. - ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + std::thread t(internal_cppthread_driver); + t.detach(); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -694,7 +553,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -705,30 +564,17 @@ void ThreadsExec::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +619,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +630,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + (&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -808,13 +654,11 @@ void ThreadsExec::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; } //---------------------------------------------------------------------------- @@ -834,7 +678,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 76% rename from lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfb..a5eb231cb0 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include @@ -23,41 +23,25 @@ #include #include -#include - #include #include #include #include +#include +#include //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -67,18 +51,16 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. - ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + ThreadState volatile m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -93,41 +75,36 @@ class ThreadsExec { static void global_lock(); static void global_unlock(); - static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -143,15 +120,8 @@ class ThreadsExec { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile int &, const int); - //------------------------------------ // All-thread functions: @@ -166,14 +136,14 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +161,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -207,21 +177,21 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -234,9 +204,9 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast(reduce_memory()), @@ -265,8 +235,8 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -289,10 +259,10 @@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +273,37 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // Wait for contributing threads' scan value to be available. if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, ThreadsExec::Active); - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. @@ -346,7 +314,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -362,19 +330,18 @@ class ThreadsExec { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -391,8 +358,8 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +367,9 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +391,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -433,18 +400,14 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -583,30 +546,38 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} +#endif inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f26299..59577609ab 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef1..4a89c4fad8 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -59,37 +59,37 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor, ? begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a2..f927d7c6a6 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -73,14 +73,14 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template Policy fix_policy(Policy policy) { @@ -96,12 +96,12 @@ class ParallelFor, public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480..fa63215a9e 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); - reference_type update = - self.m_reducer.init(static_cast(exec.reduce_memory())); + reference_type update = self.m_reducer.init( + static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(self.m_reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c..bf4c2a532a 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -68,42 +68,44 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce, : self.m_policy.end(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9d..4db310701f 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -58,16 +58,16 @@ class ParallelReduce( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast(exec.reduce_memory()))); + static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -145,37 +145,37 @@ class ParallelScanWithTotal, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bc..3df9dc07bf 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp similarity index 52% rename from lib/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index acd2a572c8..b98b6dbb73 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -14,28 +14,30 @@ // //@HEADER -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP -#include +#include + +#include namespace Kokkos { -namespace Experimental { +namespace Impl { -// swap -template -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable::value && std::is_move_constructible::value, - "Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quite down the system + , + ROOT // Never sleep or yield the root thread +}; - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); -} +void host_thread_yield(const uint32_t i, const WaitMode mode); -} // namespace Experimental +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value); + +} // namespace Impl } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_State.hpp similarity index 59% rename from lib/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_State.hpp index 21ba7fad01..148e9aa4e0 100644 --- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_State.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP -#ifdef KOKKOS_ENABLE_HBWSPACE namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL - /// processor) -} // namespace Experimental +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl } // namespace Kokkos -#endif + #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 95% rename from lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp rename to lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index b1cadc7c48..fd0f221365 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include #include -#include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -50,8 +51,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -84,14 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsExec::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadState::Active; } } @@ -188,10 +188,10 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return value; + if (m_instance == nullptr) return value; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -229,9 +229,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return; + if (m_instance == nullptr) return; - type* const local_value = ((type*)m_exec->scratch_memory()); + type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution if (team_rank() != team_size() - 1) { *local_value = contribution; } @@ -285,9 +285,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return type(0); + if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +342,10 @@ class ThreadsExecTeamMember { template ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -361,9 +361,11 @@ class ThreadsExecTeamMember { if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +374,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +389,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? (team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +400,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +415,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +429,7 @@ class ThreadsExecTeamMember { } ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +444,8 @@ class ThreadsExecTeamMember { m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? **m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } @@ -999,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto& team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1010,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548..c88d66db5f 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { @@ -61,16 +61,17 @@ class ParallelFor, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp deleted file mode 100644 index 1328c93135..0000000000 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#endif diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp index e115f7051f..cf405e57b8 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1..4d7caec6f5 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 5c182db566..4a69652616 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -90,8 +90,6 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); @@ -131,11 +129,15 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Experimental::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -165,20 +167,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Experimental::Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr (std::is_same_v) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { @@ -313,8 +338,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { return std::stoi(id.c_str()); } -std::vector Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector Kokkos::Impl::get_visible_devices(int device_count) { std::vector visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -341,30 +365,9 @@ std::vector Kokkos::Impl::get_visible_devices( } } } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." - << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -374,10 +377,10 @@ std::vector Kokkos::Impl::get_visible_devices( return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); +std::optional Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { + std::vector visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = settings.get_device_id(); @@ -423,14 +426,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." << std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated @@ -445,13 +449,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly. -#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -571,19 +568,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else @@ -604,6 +588,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); @@ -616,11 +605,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else @@ -645,8 +629,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", "BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", "BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -659,8 +641,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) @@ -673,8 +653,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -683,6 +661,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); +#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU architecture", + "SG2042 (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -752,8 +733,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); @@ -911,36 +892,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], "--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -949,15 +912,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -966,70 +922,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - (strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." - " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1098,9 +991,6 @@ void Kokkos::Impl::parse_environment_variables( } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1125,34 +1015,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - "KOKKOS_RAND_DEVICES." - " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/lib/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp b/lib/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19c..70dca5d8fa 100644 --- a/lib/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,17 +17,17 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include #include namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector get_visible_devices(int device_count); // test-only +std::vector const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp index 4babe2d72b..de6e83ed1f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp @@ -21,10 +21,11 @@ #include #include -#include +#include #include #include #include +#include // show_warnings #include #include @@ -38,6 +39,12 @@ void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +void log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } +} + std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; @@ -64,7 +71,8 @@ std::string human_memory_size(size_t arg_bytes) { void Experimental::RawMemoryAllocationFailure::print_error_message( std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << "Allocation of size " + << ::Kokkos::Impl::human_memory_size(m_attempted_size); o << " failed"; switch (m_failure_mode) { case FailureMode::OutOfMemoryError: diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp index 3d0b1d3274..1058fd98db 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp @@ -28,6 +28,8 @@ namespace Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +void log_warning(const std::string &msg); + std::string human_memory_size(size_t arg_bytes); } // namespace Impl @@ -58,7 +60,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc { HIPMallocManaged, SYCLMallocDevice, SYCLMallocShared, - SYCLMallocHost + SYCLMallocHost, + OpenACCMalloc, }; private: diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index cd640b88cb..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,313 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; - - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? (alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || - (reinterpret_cast(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? static_cast(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d16..bcce013b00 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; #endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator-=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -923,13 +838,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs < static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index a9d7216059..1047b773d7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -20,23 +20,11 @@ #include +#include +#include #include -#include #include -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include #include #include @@ -50,10 +38,6 @@ #include #endif -#include -#include -#include - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -150,84 +134,6 @@ void HostSpace::impl_deallocate( } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// {{{1 - #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb..3072e2ce82 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HostSpace::execution_space& exec, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. // We can't use exec.fence() directly since we don't have a full definition @@ -36,12 +35,6 @@ struct ZeroMemset> { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } }; } // end namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7..11bf701b57 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60..25f09b8286 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); diff --git a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7..11a93c6bb5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool disable_warnings; - bool tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -64,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -80,41 +74,6 @@ class InitializationSettings { #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_MemorySpace.cpp b/lib/kokkos/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b2..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... - } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_MemorySpace.hpp b/lib/kokkos/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include -#include -#include - -#include - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917..15c466b27e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -154,7 +154,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +220,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47..b66886d9f7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -226,7 +226,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4..0bc3814b3a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include #include +#include +#include namespace Kokkos { namespace Impl { @@ -321,5 +323,53 @@ void SharedAllocationRecord::print_host_accessible_records( } #endif +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + auto generate_failure_message = [&](std::ostream& o) { + o << "Kokkos failed to allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space_name + << "\" failed with the following error: "; + failure.print_error_message(o); + if (failure.failure_mode() == + Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned) { + // TODO: delete the misaligned memory? + o << "Warning: Allocation failed due to misalignment; memory may " + "be leaked.\n"; + } + o.flush(); + }; + try { + std::ostringstream sstr; + generate_failure_message(sstr); + Kokkos::Impl::throw_runtime_exception(sstr.str()); + } catch (std::bad_alloc const&) { + // Probably failed to allocate the string because we're so close to out + // of memory. Try printing to std::cerr instead + try { + generate_failure_message(std::cerr); + } catch (std::bad_alloc const&) { + // oh well, we tried... + } + Kokkos::Impl::throw_runtime_exception( + "Kokkos encountered an allocation failure, then another allocation " + "failure while trying to create the error message."); + } +} + +void fill_host_accessible_header_info( + SharedAllocationRecord* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158..99ab660213 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class SharedAllocationRecordCommon; template friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -145,25 +148,23 @@ class SharedAllocationRecord { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast(m_alloc_ptr + 1); } /* User's memory begins at the end of the header */ @@ -195,23 +196,79 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure); + +template +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +template +SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template class SharedAllocationRecordCommon : public SharedAllocationRecord { private: using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; - derived_t& self() { return *static_cast(this); } - derived_t const& self() const { return *static_cast(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* arg_rec); public: + ~SharedAllocationRecordCommon(); + template + SharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -231,22 +288,103 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { template class HostInaccessibleSharedAllocationRecordCommon - : public SharedAllocationRecordCommon { + : public SharedAllocationRecord { private: - using base_t = SharedAllocationRecordCommon; using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Reallocate tracked memory in the space */ + static void deallocate_tracked(void* arg_alloc_ptr); + /**\brief Deallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* alloc_ptr) -> derived_t*; std::string get_label() const; }; +#ifdef KOKKOS_ENABLE_DEBUG +template +SharedAllocationRecord + SharedAllocationRecordCommon::s_root_record; + +template +SharedAllocationRecord + HostInaccessibleSharedAllocationRecordCommon::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::SharedAllocationRecordCommon { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> + namespace { /* Taking the address of this function so make sure it is unique */ diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db0..41036ab067 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template +SharedAllocationRecordCommon::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} +template +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template +SharedAllocationRecordCommon::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template +HostInaccessibleSharedAllocationRecordCommon:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template auto SharedAllocationRecordCommon::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -76,9 +136,64 @@ void* SharedAllocationRecordCommon::reallocate_tracked( Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template +auto HostInaccessibleSharedAllocationRecordCommon::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template +void HostInaccessibleSharedAllocationRecordCommon::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast(arg_rec); +} + +template +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -108,20 +223,6 @@ std::string SharedAllocationRecordCommon::get_label() const { return record_base_t::m_label; } -template -void SharedAllocationRecordCommon:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // Fill in the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template void SharedAllocationRecordCommon::print_records( std::ostream& s, const MemorySpace&, bool detail) { diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include -#include - -#include - -#include - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template -std::enable_if_t::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564..cadeed1a6d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template struct always_true : std::true_type {}; +// type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp index 725ba5de09..fe43b63018 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp @@ -27,10 +27,9 @@ struct ViewDataAnalysis> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void

::value, ""); + static_assert(std::is_void

::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -130,6 +129,12 @@ class ViewMapping> { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); @@ -507,7 +512,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 0000000000..04c0c9aeed --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,402 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include + +namespace Kokkos::Impl { + +template +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template +struct variadic_size_t { + enum : size_t { value = variadic_size_t::value }; +}; + +template +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template +struct rank_dynamic { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template \ + constexpr size_t ViewDimension##R::ArgN##R; \ + template \ + constexpr size_t ViewDimension##R::N##R; \ + template \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does not do empty base class optimization by default. +// Per standard it is required for standard layout types +template +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0::value, + rank_dynamic::value>, + public ViewDimension1::value, + rank_dynamic::value>, + public ViewDimension2::value, + rank_dynamic::value>, + public ViewDimension3::value, + rank_dynamic::value>, + public ViewDimension4::value, + rank_dynamic::value>, + public ViewDimension5::value, + rank_dynamic::value>, + public ViewDimension6::value, + rank_dynamic::value>, + public ViewDimension7::value, + rank_dynamic::value> { + using D0 = ViewDimension0::value, + rank_dynamic::value>; + using D1 = ViewDimension1::value, + rank_dynamic::value>; + using D2 = ViewDimension2::value, + rank_dynamic::value>; + using D3 = ViewDimension3::value, + rank_dynamic::value>; + using D4 = ViewDimension4::value, + rank_dynamic::value>; + using D5 = ViewDimension5::value, + rank_dynamic::value>; + using D6 = ViewDimension6::value, + rank_dynamic::value>; + using D7 = ViewDimension7::value, + rank_dynamic::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template + struct prepend { + using type = ViewDimension; + }; + + template + struct append { + using type = ViewDimension; + }; +}; + +template +struct ViewDimensionJoin; + +template +struct ViewDimensionJoin, ViewDimension> { + using type = ViewDimension; +}; + +//---------------------------------------------------------------------------- + +template +struct ViewDimensionAssignable; + +template +struct ViewDimensionAssignable, + ViewDimension> { + using dst = ViewDimension; + using src = ViewDimension; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template +struct ViewDataType; + +template +struct ViewDataType> { + using type = T; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type[N]; +}; + +/**\brief Analysis of View data type. + * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. + */ +template +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t; + using non_const_value_type = std::remove_const_t; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same::value); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType::type; + using const_type = typename ViewDataType::type; + using non_const_type = + typename ViewDataType::type; + + // Generate "flattened" multidimensional array specification type. + using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f68..3217c76e38 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -33,255 +33,7 @@ #include #include #include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template -struct variadic_size_t { - enum : size_t { value = variadic_size_t::value }; -}; - -template -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template -struct rank_dynamic { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template \ - constexpr size_t ViewDimension##R::ArgN##R; \ - template \ - constexpr size_t ViewDimension##R::N##R; \ - template \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. -// Per standard it is required for standard layout types -template -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0::value, - rank_dynamic::value>, - public ViewDimension1::value, - rank_dynamic::value>, - public ViewDimension2::value, - rank_dynamic::value>, - public ViewDimension3::value, - rank_dynamic::value>, - public ViewDimension4::value, - rank_dynamic::value>, - public ViewDimension5::value, - rank_dynamic::value>, - public ViewDimension6::value, - rank_dynamic::value>, - public ViewDimension7::value, - rank_dynamic::value> { - using D0 = ViewDimension0::value, - rank_dynamic::value>; - using D1 = ViewDimension1::value, - rank_dynamic::value>; - using D2 = ViewDimension2::value, - rank_dynamic::value>; - using D3 = ViewDimension3::value, - rank_dynamic::value>; - using D4 = ViewDimension4::value, - rank_dynamic::value>; - using D5 = ViewDimension5::value, - rank_dynamic::value>; - using D6 = ViewDimension6::value, - rank_dynamic::value>; - using D7 = ViewDimension7::value, - rank_dynamic::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template - struct prepend { - using type = ViewDimension; - }; - - template - struct append { - using type = ViewDimension; - }; -}; - -template -struct ViewDimensionJoin; - -template -struct ViewDimensionJoin, ViewDimension> { - using type = ViewDimension; -}; - -//---------------------------------------------------------------------------- - -template -struct ViewDimensionAssignable; - -template -struct ViewDimensionAssignable, - ViewDimension> { - using dst = ViewDimension; - using src = ViewDimension; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -657,21 +409,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. - static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -708,149 +459,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and dimension generate the View data type */ -template -struct ViewDataType; - -template -struct ViewDataType> { - using type = T; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. - */ -template -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t; - using non_const_value_type = std::remove_const_t; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis; - - // ValueType is opportunity for partial specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same::value, - ""); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - // Generate "flattened" multidimensional array specification type. - using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template @@ -2919,13 +2527,9 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -2949,37 +2553,33 @@ struct ViewValueFunctor { template void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? "Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3057,13 +2657,9 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -3086,32 +2682,28 @@ struct ViewValueFunctor { } void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } + PolicyType policy(0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); + const Kokkos::Impl::ParallelFor closure( + *this, PolicyType(0, n)); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3896,7 +3488,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence, + Indices... indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds(map, args...); +template +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast(i), - static_cast(map.extent(R)), (sizeof...(Args) ? ',' : ')')); - view_error_operator_bounds(buf + n, len - n, map, args...); +template +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template -struct OperatorBoundsErrorOnDevice; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. - The existence of this alias indicates the existence of MapType::is_managed - */ template using printable_label_typedef_t = typename T::printable_label_typedef; -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice::run(map); -} - template KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker const& tracker, const MapType& map, Args... args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v) { + // Check #3: is the View managed as determined by the MemoryTraits? + if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence()); + Kokkos::abort(err); } } diff --git a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61..30f6fa2ad2 100644 --- a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,11 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif #endif diff --git a/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp index 91820fbcca..e43535451c 100644 --- a/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp +++ b/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -83,7 +83,7 @@ struct IndexTypePolicyMixin : AnalyzeNextTrait { "Kokkos Error: More than one index type given. Search " "compiler output for 'show_extra_index_type' to see the " "type of the errant tag."); - static_assert(std::is_integral::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c37..c2ca5a341f 100644 --- a/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762a..98ad1d7ebb 100644 --- a/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp index 8613002553..4e91d89f0f 100644 --- a/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c85..ae7aa6e534 100644 --- a/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index b71c72c3c9..6dfb7505c5 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -86,11 +86,13 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestBitManipulation.cpp TestInterOp.cpp + TestRangePolicyCTAD.cpp TestStringManipulation.cpp TestVersionMacros.cpp TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp + TestMDRangePolicyCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) @@ -184,6 +186,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDSpan MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf @@ -200,6 +203,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + Swap ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid @@ -233,6 +237,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCopy_a ViewCopy_b ViewCtorDimMatch + ViewEmptyRuntimeUnmanaged ViewHooks ViewLayoutStrideAssignment ViewMapping_a @@ -240,6 +245,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing @@ -372,20 +378,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. @@ -522,17 +529,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -549,17 +546,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() @@ -677,7 +667,6 @@ endif() if (Kokkos_ENABLE_OPENMP) set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_OpenMP @@ -724,12 +713,14 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesRefCounting.cpp hpx/TestHPX_IndependentInstancesSynchronization.cpp ) +if(Kokkos_ENABLE_DEPRECATED_CODE_4) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp ) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) @@ -797,6 +788,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES @@ -1039,13 +1036,7 @@ KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate tools/TestCategoricalTuner.cpp ) endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - endif() + SET(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp @@ -1167,15 +1158,6 @@ KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest ) endif() -if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - endforeach(INITTESTS_NUM) -endif() - if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HWLOC @@ -1259,12 +1241,10 @@ if (NOT KOKKOS_HAS_TRILINOS) INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py - ) - endif() + add_test( + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + ) endif() endif() diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 33a84b61f9..202809d3fc 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp index 4f67b2eddc..f1316a7426 100644 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/lib/kokkos/core/unit_test/TestArray.cpp b/lib/kokkos/core/unit_test/TestArray.cpp index d3bdc4f93f..673d0036b7 100644 --- a/lib/kokkos/core/unit_test/TestArray.cpp +++ b/lib/kokkos/core/unit_test/TestArray.cpp @@ -49,4 +49,28 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +template +KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { + if (std::size(l) != std::size(r)) return false; + + for (size_t i = 0; i != std::size(l); ++i) { + if (l[i] != r[i]) return false; + } + + return true; +} + +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 +KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array b{1, 2, 3, 5, x}; + + return std::is_same_v && is_equal(a, b); +} + +static_assert(test_array_ctad()); +#endif + } // namespace diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp index a5aebed413..cd7ba47aa1 100644 --- a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp +++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/lib/kokkos/core/unit_test/TestAtomics.hpp b/lib/kokkos/core/unit_test/TestAtomics.hpp index 2b40f12d0a..5f48e8c974 100644 --- a/lib/kokkos/core/unit_test/TestAtomics.hpp +++ b/lib/kokkos/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp b/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff61..2f3bcfe817 100644 --- a/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index bcae2e1d81..5501a35b7f 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/lib/kokkos/core/unit_test/TestConcepts.hpp b/lib/kokkos/core/unit_test/TestConcepts.hpp index 476a884832..b85867bf63 100644 --- a/lib/kokkos/core/unit_test/TestConcepts.hpp +++ b/lib/kokkos/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e..0000000000 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include - -#include - -#ifdef KOKKOS_ENABLE_OPENMP -#include -#endif -#include -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0); - int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. - int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. - if (std::is_same::value || - std::is_same::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. -void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -TEST(defaultdevicetypeinit, initstruct_default) { - Kokkos::InitArguments args; - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -TEST(defaultdevicetypeinit, initstruct_nthreads) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -TEST(defaultdevicetypeinit, initstruct_device) { - Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -TEST(defaultdevicetypeinit, initstruct_nthreads_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true); - Impl::test_initstruct_args(args); -} -#endif - -} // namespace Test - -#endif diff --git a/lib/kokkos/core/unit_test/TestDeviceAndThreads.py b/lib/kokkos/core/unit_test/TestDeviceAndThreads.py index 1d3ff8eea7..63d26ad41a 100644 --- a/lib/kokkos/core/unit_test/TestDeviceAndThreads.py +++ b/lib/kokkos/core/unit_test/TestDeviceAndThreads.py @@ -17,6 +17,8 @@ import unittest import subprocess +import platform +import os PREFIX = "$" EXECUTABLE = "$" @@ -30,7 +32,22 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 3, 5, 7]: + args = [] + name = platform.system() + if name == 'Darwin': + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + elif name == 'Linux': + args = ['nproc', '--all'] + else: + args = ['wmic', 'cpu', 'get', 'NumberOfCores'] + + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + phys_cores_count = int(output) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \ + if GetFlag("hwloc_enabled") else [1,2,3,4,5] + + for x in looplist: if x >= max_threads: break yield x @@ -48,13 +65,25 @@ class KokkosInitializationTestCase(unittest.TestCase): "num_threads", "--kokkos-num-threads={}".format(num_threads))) + def test_num_devices(self): + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + self.assertNotEqual(num_devices, 0) + if num_devices == -1: + self.skipTest("no device backend enabled") + self.assertGreaterEqual(num_devices, 1) + def test_device_id(self): - device_count = GetFlag("device_count") - if device_count == 0: - self.skipTest("no device detected") + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + if num_devices == -1: + self.assertEqual(-1, GetFlag("device_id")) + self.skipTest("no device backend enabled") # by default use the first GPU available for execution self.assertEqual(0, GetFlag("device_id")) - for device_id in range(device_count): + for device_id in range(num_devices): self.assertEqual( device_id, GetFlag( diff --git a/lib/kokkos/core/unit_test/TestExecutionSpace.hpp b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp index 6f0f159c17..983a5975af 100644 --- a/lib/kokkos/core/unit_test/TestExecutionSpace.hpp +++ b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp @@ -25,13 +25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable { Kokkos::DefaultExecutionSpace device; Kokkos::DefaultHostExecutionSpace host; - KOKKOS_FUNCTION void operator()(int, int& e) const { - // not actually doing anything useful, mostly checking that - // ExecutionSpace::in_parallel() is callable - if (static_cast(device.in_parallel()) < 0) { - ++e; - } - } + KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; } CheckClassWithExecutionSpaceAsDataMemberIsCopyable() { int errors; diff --git a/lib/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/lib/kokkos/core/unit_test/TestFunctorAnalysis.hpp index c024526111..e58324144e 100644 --- a/lib/kokkos/core/unit_test/TestFunctorAnalysis.hpp +++ b/lib/kokkos/core/unit_test/TestFunctorAnalysis.hpp @@ -59,16 +59,15 @@ void test_functor_analysis() { using R01 = typename A01::Reducer; - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_same::value); - static_assert(!A01::has_join_member_function, ""); - static_assert(!A01::has_init_member_function, ""); - static_assert(!A01::has_final_member_function, ""); - static_assert(A01::StaticValueSize == 0, ""); + static_assert(!A01::has_join_member_function); + static_assert(!A01::has_init_member_function); + static_assert(!A01::has_final_member_function); + static_assert(A01::StaticValueSize == 0); ASSERT_EQ(R01(c01).length(), 0); //------------------------------ @@ -78,16 +77,15 @@ void test_functor_analysis() { Kokkos::RangePolicy, decltype(c02), void>; using R02 = typename A02::Reducer; - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); - static_assert(!A02::has_join_member_function, ""); - static_assert(!A02::has_init_member_function, ""); - static_assert(!A02::has_final_member_function, ""); - static_assert(A02::StaticValueSize == sizeof(double), ""); + static_assert(!A02::has_join_member_function); + static_assert(!A02::has_init_member_function); + static_assert(!A02::has_final_member_function); + static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/lib/kokkos/core/unit_test/TestHalfOperators.hpp b/lib/kokkos/core/unit_test/TestHalfOperators.hpp index 752e3b5081..c69cdd5703 100644 --- a/lib/kokkos/core/unit_test/TestHalfOperators.hpp +++ b/lib/kokkos/core/unit_test/TestHalfOperators.hpp @@ -268,96 +268,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast(nv_tmp); - if (std::is_same::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - - actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs == h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -995,33 +905,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { static_cast(epsilon)); } -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - static_cast(epsilon)); - } - } -#endif - // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions // ASSERT_TRUE(std::is_trivially_copyable::value); diff --git a/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff5205..467b9ad157 100644 --- a/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/lib/kokkos/core/unit_test/TestInitializationSettings.cpp b/lib/kokkos/core/unit_test/TestInitializationSettings.cpp index f5be0e47aa..40dc3f11df 100644 --- a/lib/kokkos/core/unit_test/TestInitializationSettings.cpp +++ b/lib/kokkos/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) @@ -52,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -75,8 +49,6 @@ constexpr bool test_initialization_settings_getter() { TYPE>::value); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); diff --git a/lib/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp b/lib/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa70..efe4a2307a 100644 --- a/lib/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/lib/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/lib/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp b/lib/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 0000000000..b2c3d021c3 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestMDRangePolicyCTAD { + template + static void maybe_unused(Ts&&...) {} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array a; + [[maybe_unused]] static inline Kokkos::Array aa; + [[maybe_unused]] static inline int64_t i64; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp b/lib/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7..6f241b45d4 100644 --- a/lib/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp +++ b/lib/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { template @@ -86,12 +88,56 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy, Kokkos::IndexType>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound (-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); +} + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; + + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif diff --git a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp index 424ba05a90..ad035d4e4b 100644 --- a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp +++ b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ -287,21 +287,20 @@ struct FloatingPointComparison { public: template - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; } template - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -315,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; @@ -348,7 +347,7 @@ struct math_function_name; } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -373,7 +372,7 @@ struct math_function_name; math_unary_function_return_type_t>::value); \ return REF_FUNC; \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -477,7 +476,7 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -511,7 +510,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); math_ternary_function_return_type_t>::value); \ return std::FUNC(x, y, z); \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk3_##FUNC = MathTernaryFunction_##FUNC; \ template <> \ @@ -1307,12 +1306,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1332,19 +1331,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1365,26 +1362,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1392,8 +1389,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. || !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1425,7 +1421,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1434,7 +1430,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1443,17 +1439,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1462,23 +1458,19 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1502,7 +1494,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1511,7 +1503,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1520,18 +1512,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1540,26 +1532,23 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1765,7 +1754,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1775,7 +1764,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1801,11 +1790,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c7513..7969dc8686 100644 --- a/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,13 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 19; + upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1462,13 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 8; + upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), Kokkos::abs(h_ref_cbk1(i)) * 1e-13) << "at index " << i; @@ -1718,20 +1718,26 @@ struct TestComplexBesselH1Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 16; + upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1912,19 +1918,26 @@ struct TestComplexBesselH2Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 17; + upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; @@ -1954,31 +1967,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function test; test.testit(); } diff --git a/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125c..116ac58c39 100644 --- a/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp +++ b/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/lib/kokkos/core/unit_test/TestNumericTraits.hpp b/lib/kokkos/core/unit_test/TestNumericTraits.hpp index ec1c1e0ca0..81a9d0a5e0 100644 --- a/lib/kokkos/core/unit_test/TestNumericTraits.hpp +++ b/lib/kokkos/core/unit_test/TestNumericTraits.hpp @@ -210,9 +210,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -224,9 +225,9 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -239,9 +240,9 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -253,9 +254,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -263,9 +264,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -302,8 +303,10 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -326,8 +329,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -349,8 +354,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -358,8 +365,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -380,8 +389,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -395,8 +406,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -407,8 +420,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -426,8 +441,10 @@ TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -442,7 +459,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -524,39 +541,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -623,15 +640,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -651,14 +666,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -706,17 +718,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/lib/kokkos/core/unit_test/TestOccupancyControlTrait.hpp b/lib/kokkos/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 0000000000..345a906d66 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,80 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} + +template +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy{1, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {1, 1}}); +} +#endif +} // namespace diff --git a/lib/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/lib/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fe..a56dfd9efc 100644 --- a/lib/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/lib/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - TEST(defaultdevicetype, env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,22 +390,20 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ + auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ @@ -444,6 +412,8 @@ TEST(defaultdevicetype, visible_devices) { // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +422,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( diff --git a/lib/kokkos/core/unit_test/TestRangePolicyCTAD.cpp b/lib/kokkos/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 0000000000..20288e2b40 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "Kokkos_Core_fwd.hpp" + +namespace { + +struct TestRangePolicyCTAD { + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + + [[maybe_unused]] static int concurrency() { return 0; } + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline auto i64 = int64_t(); + [[maybe_unused]] static inline auto i32 = int32_t(); + [[maybe_unused]] static inline auto cs = Kokkos::ChunkSize(0); + [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace(); + [[maybe_unused]] static inline auto nes = + ImplicitlyConvertibleToDefaultExecutionSpace(); + [[maybe_unused]] static inline auto ses = SomeExecutionSpace(); + + // RangePolicy() + + [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{}; + static_assert(std::is_same_v, decltype(rp)>); + + // RangePolicy(index_type, index_type) + + [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64); + static_assert(std::is_same_v, decltype(rpi64i64)>); + + [[maybe_unused]] static inline auto rpi64i32 = Kokkos::RangePolicy(i64, i32); + static_assert(std::is_same_v, decltype(rpi64i32)>); + + [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64); + static_assert(std::is_same_v, decltype(rpi32i64)>); + + [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32); + static_assert(std::is_same_v, decltype(rpi32i32)>); + + // RangePolicy(index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpi64i64cs = + Kokkos::RangePolicy(i64, i64, cs); + static_assert(std::is_same_v, decltype(rpi64i64cs)>); + + [[maybe_unused]] static inline auto rpi64i32cs = + Kokkos::RangePolicy(i64, i32, cs); + static_assert(std::is_same_v, decltype(rpi64i32cs)>); + + [[maybe_unused]] static inline auto rpi32i64cs = + Kokkos::RangePolicy(i32, i64, cs); + static_assert(std::is_same_v, decltype(rpi32i64cs)>); + + [[maybe_unused]] static inline auto rpi32i32cs = + Kokkos::RangePolicy(i32, i32, cs); + static_assert(std::is_same_v, decltype(rpi32i32cs)>); + + // RangePolicy(execution_space, index_type, index_type) + + [[maybe_unused]] static inline auto rpdesi64i64 = + Kokkos::RangePolicy(des, i64, i64); + static_assert(std::is_same_v, decltype(rpdesi64i64)>); + + [[maybe_unused]] static inline auto rpdesi32i32 = + Kokkos::RangePolicy(des, i32, i32); + static_assert(std::is_same_v, decltype(rpdesi32i32)>); + + [[maybe_unused]] static inline auto rpnesi64i64 = + Kokkos::RangePolicy(nes, i64, i64); + static_assert(std::is_same_v, decltype(rpnesi64i64)>); + + [[maybe_unused]] static inline auto rpnesi32i32 = + Kokkos::RangePolicy(nes, i32, i32); + static_assert(std::is_same_v, decltype(rpnesi32i32)>); + + [[maybe_unused]] static inline auto rpsesi64i64 = + Kokkos::RangePolicy(ses, i64, i64); + static_assert(std::is_same_v, + decltype(rpsesi64i64)>); + + [[maybe_unused]] static inline auto rpsesi32i32 = + Kokkos::RangePolicy(ses, i32, i32); + static_assert(std::is_same_v, + decltype(rpsesi32i32)>); + + // RangePolicy(execution_space, index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpdesi64i64cs = + Kokkos::RangePolicy(des, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpdesi64i64cs)>); + + [[maybe_unused]] static inline auto rpdesi32i32cs = + Kokkos::RangePolicy(des, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpdesi32i32cs)>); + + [[maybe_unused]] static inline auto rpnesi64i64cs = + Kokkos::RangePolicy(nes, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpnesi64i64cs)>); + + [[maybe_unused]] static inline auto rpnesi32i32cs = + Kokkos::RangePolicy(nes, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpnesi32i32cs)>); + + [[maybe_unused]] static inline auto rpsesi64i64cs = + Kokkos::RangePolicy(ses, i64, i64, cs); + static_assert(std::is_same_v, + decltype(rpsesi64i64cs)>); + + [[maybe_unused]] static inline auto rpsesi32i32cs = + Kokkos::RangePolicy(ses, i32, i32, cs); + static_assert(std::is_same_v, + decltype(rpsesi32i32cs)>); + +}; // TestRangePolicyCTAD struct + +// To eliminate maybe_unused warning on some compilers + +[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes = + TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); + +[[maybe_unused]] const auto sesconcurrency = + TestRangePolicyCTAD::ses.concurrency(); + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp b/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp index 0a7e59ed98..c8c1542af1 100644 --- a/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp +++ b/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp @@ -18,6 +18,9 @@ #include +#include +#include + namespace { TEST(TEST_CATEGORY, range_policy_runtime_parameters) { @@ -70,4 +73,127 @@ TEST(TEST_CATEGORY, range_policy_runtime_parameters) { } } +TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { + using Policy = Kokkos::RangePolicy; + using ChunkSize = Kokkos::ChunkSize; + + std::string msg = + "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than " + "the upper bound (90).\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // escape the parentheses in the regex to match the error message + msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ASSERT_DEATH({ (void)Policy(100, 90); }, msg); + + ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); }, + msg); +#else + + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(100, 90); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + +#endif +} + +TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) { + using UIntIndexType = Kokkos::IndexType; + using IntIndexType = Kokkos::IndexType; + using UIntPolicy = Kokkos::RangePolicy; + using IntPolicy = Kokkos::RangePolicy; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + int test_val = -1; + UIntPolicy policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + } // namespace diff --git a/lib/kokkos/core/unit_test/TestReducers.hpp b/lib/kokkos/core/unit_test/TestReducers.hpp index 957b9a0ca1..fbcb9629af 100644 --- a/lib/kokkos/core/unit_test/TestReducers.hpp +++ b/lib/kokkos/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include #include +#include //-------------------------------------------------------------------------- @@ -46,6 +47,37 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + SumFunctor f; + int M, N; + Kokkos::View result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View result_) + : f(f_), M(M_), N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { Kokkos::View values; @@ -319,6 +351,102 @@ struct TestReducers { value = value || values(i); } }; + + // get number of teams for TeamPolicy depending on the tested type + constexpr static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + return; // FIXME_OPENACC + } +#endif + + Scalar sum_scalar; + Kokkos::View sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } + + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v ? 32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } + } + static void test_sum(int N) { Kokkos::View values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -374,6 +502,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View sum_view("View"); sum_view() = Scalar(1); diff --git a/lib/kokkos/core/unit_test/TestReducers_d.hpp b/lib/kokkos/core/unit_test/TestReducers_d.hpp index 19eaa6d700..ecf851aa10 100644 --- a/lib/kokkos/core/unit_test/TestReducers_d.hpp +++ b/lib/kokkos/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers::test_sum(1); + TestReducers::test_sum(2); + TestReducers::test_sum(3); + TestReducers::test_sum(4); + + TestReducers::test_prod(1); + TestReducers::test_prod(2); + TestReducers::test_prod(3); + TestReducers::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' diff --git a/lib/kokkos/core/unit_test/TestSwap.hpp b/lib/kokkos/core/unit_test/TestSwap.hpp new file mode 100644 index 0000000000..4e98351cf1 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include + +namespace { + +template +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap(); } + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestTeamBasic.hpp b/lib/kokkos/core/unit_test/TestTeamBasic.hpp index c395bc0837..a3d84c5e16 100644 --- a/lib/kokkos/core/unit_test/TestTeamBasic.hpp +++ b/lib/kokkos/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/lib/kokkos/core/unit_test/TestTeamMDRange.hpp b/lib/kokkos/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf..81931467c5 100644 --- a/lib/kokkos/core/unit_test/TestTeamMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestTeamMDRange.hpp @@ -169,7 +169,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +209,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +250,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +293,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -310,7 +338,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +385,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +462,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +506,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +552,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -536,7 +599,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +649,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +697,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +737,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +778,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -723,7 +821,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +866,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +913,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -908,13 +1027,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -923,7 +1049,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1084,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -966,7 +1105,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1142,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1013,7 +1165,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1203,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1061,7 +1226,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1100,13 +1271,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1116,7 +1294,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1341,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1174,7 +1365,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,20 +1404,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1228,11 +1431,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1263,20 +1464,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1286,10 +1493,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1321,20 +1527,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1344,10 +1556,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1384,20 +1595,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1407,10 +1624,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1451,20 +1667,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1474,10 +1696,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1510,13 +1731,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1527,7 +1755,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1792,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1577,7 +1818,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1856,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1628,7 +1882,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1925,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1684,7 +1951,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +1998,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1745,7 +2025,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1904,13 +2190,6 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_ThreadVectorMDRange(dims); TestThreadVectorMDRangeParallelReduce:: @@ -1944,13 +2223,6 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_TeamVectorMDRange(dims); TestTeamVectorMDRangeParallelReduce:: diff --git a/lib/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp b/lib/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb175..9d89f75708 100644 --- a/lib/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/lib/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy::member_type const&) const {} +}; + template void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule; using LongIndex = Kokkos::IndexType; diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index 39122736ed..5e16539d65 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -1012,7 +1012,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); @@ -1028,9 +1027,7 @@ TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(11))); ASSERT_TRUE((TestTeamVector::Test(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1055,7 +1052,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { TestTripleNestedReduce(8192, 2048, 16, 16); TestTripleNestedReduce(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; diff --git a/lib/kokkos/core/unit_test/TestUtilities.hpp b/lib/kokkos/core/unit_test/TestUtilities.hpp index b1f9d30c1f..ad5a0df92d 100644 --- a/lib/kokkos/core/unit_test/TestUtilities.hpp +++ b/lib/kokkos/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp index ffc500e4a9..ca098dbc24 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); @@ -1005,25 +1004,25 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View; - using dView3_1 = Kokkos::View; + using dView3_1 = Kokkos::View; using dView3_2 = Kokkos::View; using dView3_3 = Kokkos::View; @@ -1554,6 +1553,7 @@ class TestViewAPI { Kokkos::CudaUVMSpace>::value) return; #endif + bool did_throw = false; auto alloc_size = std::numeric_limits::max() - 42; try { auto should_always_fail = dView1("hello_world_failure", alloc_size); @@ -1585,7 +1585,9 @@ class TestViewAPI { "because of an unknown error.", msg); } #endif + did_throw = true; } + ASSERT_TRUE(did_throw); } }; diff --git a/lib/kokkos/core/unit_test/TestViewAPI_d.hpp b/lib/kokkos/core/unit_test/TestViewAPI_d.hpp index 08d21f5449..b0d759ffcc 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI_d.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI_d.hpp @@ -27,8 +27,19 @@ TEST(TEST_CATEGORY, view_api_d) { } TEST(TEST_CATEGORY, view_allocation_error) { +#if defined(__has_feature) +#if __has_feature(address_sanitizer) + GTEST_SKIP() << "AddressSanitzer detects allocating too much memory " + "preventing our checks to run"; +#endif +#endif #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same_v) { + GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; + } #endif TestViewAPI::run_test_error(); } diff --git a/lib/kokkos/core/unit_test/TestViewCopy_a.hpp b/lib/kokkos/core/unit_test/TestViewCopy_a.hpp index 3bfc93aada..a4735b2998 100644 --- a/lib/kokkos/core/unit_test/TestViewCopy_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewCopy_a.hpp @@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) { Kokkos::deep_copy(s_a, hs_a); ASSERT_TRUE(run_check(s_a, 6)); } + } else { + // These copies won't succeed, but they should each throw + // an exception whose message contains the view labels, + // and the names of the views' memory spaces. + // + // Note: original a,b both have the same device type, + // and their mirrors have the same device type. + using memory_space = typename decltype(a)::memory_space; + using mirror_memory_space = typename decltype(h_a)::memory_space; + bool threw = false; + std::string msg; + try { + Kokkos::deep_copy(hs_b, s_b); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(hs_b.label()), std::string::npos); + ASSERT_NE(msg.find(s_b.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); + threw = false; + try { + Kokkos::deep_copy(s_a, hs_a); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(s_a.label()), std::string::npos); + ASSERT_NE(msg.find(hs_a.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); } // Contiguous copies diff --git a/lib/kokkos/core/unit_test/TestViewCtorDimMatch.hpp b/lib/kokkos/core/unit_test/TestViewCtorDimMatch.hpp index d71841eef8..40b7737f2e 100644 --- a/lib/kokkos/core/unit_test/TestViewCtorDimMatch.hpp +++ b/lib/kokkos/core/unit_test/TestViewCtorDimMatch.hpp @@ -19,33 +19,72 @@ namespace Test { -#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR) -#define DIE(EXPR, ARGS, DYNRANK) \ - ASSERT_DEATH( \ - EXPR, \ - "Constructor for Kokkos View 'v_" #ARGS \ - "' has mismatched number of arguments. Number of arguments = " #ARGS \ - " but dynamic rank = " #DYNRANK) +template +void test_matching_arguments_rank_helper(std::index_sequence) { + constexpr int nargs = sizeof...(Is); + using view_type = Kokkos::View; + if (nargs == rank || nargs == dynrank) { + EXPECT_NO_THROW({ view_type v("v", ((Is * 0) + 1)...); }); + EXPECT_NO_THROW({ view_type v(nullptr, ((Is * 0) + 1)...); }); + } else { + ASSERT_DEATH( + { view_type v("v", ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'v' has mismatched number of arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<7>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template +struct DynamicRank { + using type = typename DynamicRank::type*; +}; -using DType = int; +template <> +struct DynamicRank<0> { + using type = int; +}; // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend @@ -53,348 +92,110 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } - - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } - - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 7); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 7); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, DynamicRank>(); // dim = 8, dynamic = 8 +#endif } +template +struct StaticRank { + using type = typename StaticRank::type[1]; +}; + +template <> +struct StaticRank<0> { + using type = int; +}; + TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, StaticRank>(); // dim = 8, dynamic = 0 +#endif } +template +struct MixedRank { + using type = typename DynamicRank::type[1]; +}; + +template <> +struct MixedRank<0> { + using type = int; +}; + TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 4, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } - - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +#endif } + +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") + +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") + +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + // clang-format off + CHECK_DEATH({ Kokkos::View v("v", 2); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on +#endif +} + +#undef CHECK_DEATH #endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 - -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK - -#undef DType - -#undef LIVE -#undef DIE } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/lib/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 0000000000..b156b72860 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast(0xABADBABE); + + (void)Kokkos::View(p); + (void)Kokkos::View(&d); + (void)Kokkos::View(nullptr); + (void)Kokkos::View(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0); + (void)Kokkos::View(&d, 0); + (void)Kokkos::View(nullptr, 0); + (void)Kokkos::View(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0, 0); + (void)Kokkos::View(&d, 0, 0); + (void)Kokkos::View(nullptr, 0, 0); + (void)Kokkos::View(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); +} + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp index 9173f0d431..a4dfdb26e3 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/lib/kokkos/core/unit_test/TestViewMapping_b.hpp b/lib/kokkos/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da84..4aee035d17 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_b.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp b/lib/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 0000000000..2716856c1f --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence{}); + } + + template + std::string get_details(std::index_sequence) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? "\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +auto make_view_impl(LblOrPtr x, std::index_sequence) { + return View(x, (Is + 1)...); +} + +template +auto make_view(LblOrPtr x) { + return make_view_impl(std::move(x), + std::make_index_sequence()); +} + +template +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View; + using V2 = Kokkos::View; + using V3 = Kokkos::View; + using V4 = Kokkos::View; + using V5 = Kokkos::View; + using V6 = Kokkos::View; + using V7 = Kokkos::View; + using V8 = Kokkos::View; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_view_out_of_bounds_access(); +} + +#endif + +} // namespace diff --git a/lib/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp b/lib/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69..25442146fb 100644 --- a/lib/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/lib/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include #include -int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); #else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Not able to query the underlying runtime because there is no such thing as + // device currently being used with SYCL. We go through the Kokkos runtime + // which makes the assert below pointless but it still let us check that + // Kokkos selected the device we asked for from the Python tests. + device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; @@ -68,6 +71,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -90,9 +101,10 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG diff --git a/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ec..8bc8ef21cd 100755 --- a/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! -z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 0000000000..d94735ceb2 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,268 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +struct StreamsAndDevices { + std::array streams; + std::array devices; + + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +// Test Interoperability with Cuda Streams +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); + } +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p0), sizeof(int) * 100)); + Kokkos::View view0(p0, 100); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p), sizeof(int) * 100)); + Kokkos::View view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} +} // namespace diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101ab..11fe6b8555 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); + + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( - Kokkos::Impl::MemorySpaceAccess::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); + + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); + + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp deleted file mode 100644 index 348b9feeab..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp deleted file mode 100644 index a77a55ea65..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1c..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f094..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab750..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519d..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bc..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb5..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd8..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d253..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d..0000000000 --- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index 8c72e9f297..a213453ea1 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,6 +48,9 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host + // FIXME_HIP + GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " + "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e2883..8f7499c244 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); + + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable); + + static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable); + + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp index 25c7138ed3..d7b2a57b44 100644 --- a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp +++ b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,10 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int in_parallel = ExecSpace::in_parallel(); ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 92b8032bf0..0000000000 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken token; - - Kokkos::View count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f843248..a4fd053e83 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aa..946169a786 100644 --- a/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp +++ b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp @@ -409,14 +409,19 @@ TEST(kokkosp, parallel_scan_no_fence) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -450,13 +455,20 @@ TEST(kokkosp, parallel_scan_no_fence_view) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::View v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { diff --git a/lib/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/lib/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a..0000000000 --- a/lib/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include -#include -#include "Kokkos_Core.hpp" - -#include - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - Kokkos::View pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template -struct AccessCheckKernel { - Kokkos::View data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector test_data(data_size); - Kokkos::View test_view(test_data.data(), data_size); - AccessCheckKernel functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { test_allowed_access(); }, - "Kokkos::View ERROR: attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test diff --git a/lib/kokkos/core/unit_test/tools/TestProfilingSection.cpp b/lib/kokkos/core/unit_test/tools/TestProfilingSection.cpp index 318766ac45..9d35d67feb 100644 --- a/lib/kokkos/core/unit_test/tools/TestProfilingSection.cpp +++ b/lib/kokkos/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); diff --git a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af83..22b8b6d63c 100644 --- a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp +++ b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f307636..909765e1fc 100644 --- a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokko::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435..ee3f4721d9 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c..1e6812adea 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d133..75d6089e9a 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 301a1fceb5..25370daa3f 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -170,12 +170,9 @@ display_help_text() { echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -226,7 +223,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -342,10 +338,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -452,15 +444,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! -e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -506,5 +489,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/lib/kokkos/gnu_generate_makefile.bash b/lib/kokkos/gnu_generate_makefile.bash index 5ea159cdd4..7a197bb71d 100755 --- a/lib/kokkos/gnu_generate_makefile.bash +++ b/lib/kokkos/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -148,12 +145,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -198,7 +192,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +291,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index a43b5276a8..bd122a456b 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -35,3 +35,4 @@ tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a +tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76f..6d0956f383 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. -#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -563,10 +565,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; @@ -818,10 +828,18 @@ class simd> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; @@ -1059,17 +1077,31 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. #ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() const { return m_value; @@ -1111,6 +1143,11 @@ class simd> { return simd( _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { @@ -1249,6 +1286,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1256,6 +1302,11 @@ class simd> { _mm256_maskstore_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1278,6 +1329,13 @@ class simd> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +1364,19 @@ class simd> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using generator constructor + // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { @@ -1444,6 +1504,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() @@ -1460,6 +1529,14 @@ class simd> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); @@ -1588,6 +1665,11 @@ class const_where_expression>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1624,6 +1706,11 @@ class where_expression>, mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1667,6 +1754,11 @@ class const_where_expression>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1703,6 +1795,11 @@ class where_expression>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd> const& index) { @@ -1746,6 +1843,12 @@ class const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1786,6 +1889,16 @@ class where_expression>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1833,6 +1946,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1874,6 +1994,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1922,6 +2053,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1963,6 +2101,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4..7fa35c204a 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp @@ -193,10 +193,18 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; @@ -475,10 +483,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; @@ -735,15 +751,25 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { @@ -934,21 +960,30 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( - value_type* ptr, element_aligned_tag) const { - _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), - m_value); - } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { m_value = _mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), @@ -1130,10 +1165,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1331,10 +1375,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +1558,11 @@ class const_where_expression>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1541,6 +1599,11 @@ class where_expression>, _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1584,6 +1647,11 @@ class const_where_expression>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1619,6 +1687,10 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1666,6 +1738,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +1780,11 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +1793,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +1874,12 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +1888,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +1969,12 @@ class where_expression>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +1983,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +2065,11 @@ class where_expression>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd> const& index) { @@ -1956,6 +2077,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags; // class template declarations for const_where_expression and where_expression @@ -117,48 +126,6 @@ template return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator*( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp index 43ece20389..efc81135d1 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp @@ -363,10 +363,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; @@ -607,10 +615,18 @@ class simd> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; @@ -844,10 +860,18 @@ class simd> { element_aligned_tag) { m_value = vld1_s32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_s32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() const { return m_value; @@ -868,7 +892,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1044,10 +1072,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; @@ -1068,7 +1104,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1246,6 +1285,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1261,7 +1312,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( @@ -1386,6 +1440,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1421,6 +1480,11 @@ class where_expression>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1464,6 +1528,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1498,6 +1567,10 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1542,6 +1615,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1577,6 +1656,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1584,6 +1669,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1622,6 +1708,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1657,6 +1749,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1664,6 +1762,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1744,6 +1855,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; } + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -308,6 +315,10 @@ class const_where_expression, void copy_to(T* mem, element_aligned_tag) const { if (static_cast(m_mask)) *mem = static_cast(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast(m_mask)) *mem = static_cast(m_value); + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> scatter_to(T* mem, simd const& index) const { @@ -315,13 +326,13 @@ class const_where_expression, mem[static_cast(index)] = static_cast(m_value); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; @@ -344,6 +355,10 @@ class where_expression, void copy_from(T const* mem, element_aligned_tag) { if (static_cast(this->m_mask)) this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast(this->m_mask)) this->m_value = *mem; + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> gather_from(T const* mem, simd const& index) { diff --git a/lib/kokkos/simd/unit_tests/TestSIMD.cpp b/lib/kokkos/simd/unit_tests/TestSIMD.cpp index 61c076e824..7a1f9be2a0 100644 --- a/lib/kokkos/simd/unit_tests/TestSIMD.cpp +++ b/lib/kokkos/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,4 @@ #include #include #include +#include diff --git a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66..c587ccf304 100644 --- a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -209,4 +209,165 @@ class shift_left { } }; +class cbrt_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + return Kokkos::exp(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template + auto on_host(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697..d36e1e5afc 100644 --- a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template + bool host_load(T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,8 +135,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = 0; return true; } @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4af08c266b..23e3826c75 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -37,10 +37,10 @@ inline void host_check_gen_ctor() { } simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { @@ -98,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); device_check_equality(basic, rhs, lanes); simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); @@ -106,7 +106,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); device_check_equality(result, blend, lanes); } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5..59f2f6c18f 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -78,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); + host_check_math_op_one_loader(op, n, args...); } template @@ -96,6 +102,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd integer types if constexpr (std::is_floating_point_v) { host_check_math_op_all_loaders(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders(cbrt_op(), n, first_args); + host_check_math_op_all_loaders(exp_op(), n, first_args); + host_check_math_op_all_loaders(log_op(), n, first_args); +#endif } } @@ -109,23 +122,29 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { constexpr size_t n = 11; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); host_check_abi_size(); if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; + alignas(alignment) DataType const first_args[n] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, 0.0, 1.2, -2.8}; + alignas(alignment) DataType const second_args[n] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, + -2, 1, 13, -3, -2}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + alignas(alignment) + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; host_check_all_math_ops(first_args, second_args); } } @@ -202,6 +221,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); + device_check_math_op_one_loader(op, n, args...); } template @@ -282,8 +302,13 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), - simd_device_math_ops_functor()); +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 0000000000..b3c7ac9a01 --- /dev/null +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include +#include + +template +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader(reduce_op, n, + args); + host_check_reduction_one_loader(reduce_op, n, args); + host_check_reduction_one_loader(reduce_op, n, args); +} + +template +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders(hmin(), n, args); + host_check_reduction_all_loaders(hmax(), n, args); + host_check_reduction_all_loaders(reduce(), n, args); +} + +template +inline void host_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + host_check_all_reductions(args); + } +} + +template +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_reductions(), ...); +} + +template +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader(reduce_op, n, + args); + device_check_reduction_one_loader(reduce_op, n, args); + device_check_reduction_one_loader(reduce_op, n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders(hmin(), n, args); + device_check_reduction_all_loaders(hmax(), n, args); + device_check_reduction_all_loaders(reduce(), n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + device_check_all_reductions(args); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_reductions(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920e..ffdd2cba4a 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -96,6 +97,8 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template @@ -104,12 +107,14 @@ inline void host_check_shift_ops() { using simd_type = Kokkos::Experimental::simd; constexpr std::size_t width = simd_type::size(); constexpr std::size_t num_cases = 8; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); DataType max = std::numeric_limits::max(); - DataType shift_by[num_cases] = { + alignas(alignment) DataType shift_by[num_cases] = { 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + alignas(alignment) DataType test_vals[width]; for (std::size_t i = 0; i < width; ++i) { DataType inc = max / width; test_vals[i] = i * inc + 1; @@ -201,10 +206,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -212,6 +218,8 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c..152fd9e984 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -29,7 +29,7 @@ inline void host_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -46,7 +46,7 @@ inline void host_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); host_check_equality(expected_result, dst_simd, nlanes); } @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -124,7 +124,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); device_check_equality(expected_result, dst_simd, nlanes); } diff --git a/lib/kokkos/tpls/desul/Config.hpp.cmake.in b/lib/kokkos/tpls/desul/Config.hpp.cmake.in index a7bc738191..aed7ecfabc 100644 --- a/lib/kokkos/tpls/desul/Config.hpp.cmake.in +++ b/lib/kokkos/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,8 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de..15c6d78d94 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global; +#else template using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1de..72639fc493 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 0000000000..77149bd474 --- /dev/null +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,153 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separetely to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'". + // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c5743..1b161397c7 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 0000000000..ab570ac578 --- /dev/null +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,431 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); + } +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Generic.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e3..fa71477c29 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Generic.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index cb97f4a906..b6a399100b 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 0000000000..d4dd74588b --- /dev/null +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,81 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Macros.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d32..d11beb0c80 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Macros.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expresion whether we are on host or +// on device with NVHPC. We use the device implementation on both sides. +#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07..6a741f6d47 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 0000000000..a5c8aa1c8a --- /dev/null +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/lib/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp b/lib/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41..6660c76e11 100644 --- a/lib/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/lib/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47..25389a2fa5 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. -template struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. - : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201ceb..8e42a37ba7 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0..9a28c3ed5c 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -55,6 +55,14 @@ __check_compatible_extents( return {}; } +template +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + (std::is_convertible::value && ... && true) && + (std::is_nothrow_constructible::value && ... && true); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +148,8 @@ struct index_sequence_scan_impl { template struct index_sequence_scan_impl { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +176,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. template struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +260,17 @@ public: #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +437,9 @@ public: class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +450,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +468,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -468,6 +483,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -481,6 +497,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &, @@ -491,17 +508,20 @@ private: public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? */ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant::rank() == extents::rank()>{}, std::integer_sequence{}, - std::integer_sequence{}))::value)) + std::integer_sequence{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +538,14 @@ public: MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + if constexpr (rank() != extents::rank()) { + return false; + } else { + using common_t = std::common_type_t; + for (size_type r = 0; r < m_rank; r++) + if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; + } + return true; } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents::type; template extents(IndexTypes...) -> extents; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98..83ed9ef7fe 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,9 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" +#include +#include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +111,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -124,13 +157,14 @@ class layout_left::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=0; r<__extents.rank(); r++) { + if(static_cast(stride) != static_cast(other.stride(r))) + std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); + stride *= __extents.extent(r); } - stride *= __extents.extent(r); } #endif } @@ -155,10 +189,7 @@ class layout_left::mapping { class... Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -172,9 +203,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +218,10 @@ class layout_left::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +229,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -215,6 +252,17 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a058648420..3d3927df7b 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -20,6 +20,7 @@ #include "extents.hpp" #include #include "layout_stride.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +114,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -129,13 +158,14 @@ class layout_right::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=__extents.rank(); r>0; r--) { + if(static_cast(stride) != static_cast(other.stride(r-1))) + std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); + stride *= __extents.extent(r-1); } - stride *= __extents.extent(r-1); } #endif } @@ -157,13 +187,10 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -174,9 +201,9 @@ class layout_right::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +216,10 @@ class layout_right::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +227,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -217,6 +250,17 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529..15ad577d14 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -88,7 +88,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array + detail::possibly_empty_array > > #endif @@ -109,7 +109,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array; + using __strides_storage_t = detail::possibly_empty_array; using __member_pair_t = detail::__compressed_pair; #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +158,16 @@ struct layout_stride { template MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_AND((static_cast(self.stride(Idxs)) == static_cast(other.stride(Idxs))) /* && ... */) + && _MDSPAN_FOLD_AND((static_cast(self.extents().extent(Idxs)) == static_cast(other.extents().extent(Idxs))) /* || ... */); } template MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_OR((static_cast(self.stride(Idxs)) != static_cast(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast(self.extents().extent(Idxs)) != static_cast(other.extents().extent(Idxs))) /* || ... */); } template @@ -205,6 +207,11 @@ struct layout_stride { } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array return_strides(const __strides_storage_t& s) { + return std::array{s[Idxs]...}; + } + template MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +225,21 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. using __impl = __deduction_workaround>; + static constexpr __strides_storage_t strides_storage(std::true_type) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } + static constexpr __strides_storage_t strides_storage(std::false_type) { + return {}; + } //---------------------------------------------------------------------------- @@ -233,7 +255,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(std::integral_constant 0)>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -332,10 +368,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible::value) && - (detail::__is_mapping_of || - detail::__is_mapping_of || - detail::__is_mapping_of) + !(std::is_convertible::value && + (detail::__is_mapping_of || + detail::__is_mapping_of || + detail::__is_mapping_of)) ) // needs two () due to comma MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,7 +410,7 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION @@ -393,8 +429,7 @@ struct layout_stride { class... Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -410,17 +445,37 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + if constexpr (extents_type::rank() == 0) + return true; + else { + index_type span_size = required_span_size(); + if (span_size == static_cast(0)) { + if constexpr (extents_type::rank() == 1) { + return stride(0) == 1; + } else { + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + } else { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + } } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ -444,10 +499,13 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); + if constexpr (extents_type::rank() > 0) { + using common_t = std::common_type_t; + for(rank_type r = 0; r < extents_type::rank(); r++) + strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); + } return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast(0)) && + (__impl::__OFFSET(y) == static_cast(0)) && strides_match; } @@ -489,6 +547,17 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe30021..d6ec49e65b 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -55,6 +55,13 @@ private: ReferenceType __callop(mdspan const& __self, const std::array& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +116,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +131,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +148,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +166,7 @@ public: (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +185,14 @@ public: MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - _MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +236,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +250,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +281,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +294,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +308,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,7 +320,7 @@ public: #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { return __impl::__size(*this); }; @@ -346,13 +355,13 @@ public: //-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +383,7 @@ private: #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... */) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83..bdc5925f71 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array::value) && @@ -133,61 +133,29 @@ public: ) : map_(m), ctr_(container_is_array::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ public: _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -238,7 +206,7 @@ public: /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -248,7 +216,7 @@ public: _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -257,7 +225,7 @@ public: /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ public: class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + mdspan_type) ) ) constexpr operator mdspan () { @@ -445,8 +414,9 @@ public: class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + const_mdspan_type) ) ) constexpr operator mdspan () const { diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba..89ba8202fb 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template @@ -29,6 +28,7 @@ namespace { template struct __mdspan_is_integral_constant>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9a..abddd0b59d 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan &src, SliceSpecifiers... slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t; + using sub_mapping_t = std::remove_cv_t; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f1..c3b2f78fb9 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -20,7 +20,6 @@ #include "strided_slice.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -319,5 +318,4 @@ constexpr auto submdspan_extents(const extents &src_exts, return detail::extents_constructor::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e7..ca6948c9a9 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -22,21 +22,15 @@ #include // index_sequence namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template struct mapping_offset { - Mapping mapping; +template struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; - // constructs sub strides template MDSPAN_INLINE_FUNCTION @@ -98,17 +92,15 @@ struct preserve_layout_left_mapping, SubRank, #pragma diag_suppress = implicit_return_from_non_void_function #endif // Actual submdspan mapping call -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_left::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type @@ -121,18 +113,18 @@ submdspan_mapping(const layout_left::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_left case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -140,7 +132,7 @@ submdspan_mapping(const layout_left::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -207,17 +199,15 @@ struct preserve_layout_right_mapping, SubRank, #pragma diagnostic push #pragma diag_suppress = implicit_return_from_non_void_function #endif -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_right::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - +layout_right::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { // get sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // determine new layout type @@ -230,18 +220,18 @@ submdspan_mapping(const layout_right::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_right case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -249,7 +239,7 @@ submdspan_mapping(const layout_right::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -270,23 +260,22 @@ submdspan_mapping(const layout_right::mapping &src_mapping, //********************************** // layout_stride submdspan_mapping //********************************* -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_stride::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +layout_stride::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -294,6 +283,7 @@ submdspan_mapping(const layout_stride::mapping &src_mapping, #else std::tuple(detail::stride_of(slices)...))), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 0000000000..a801486792 --- /dev/null +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,793 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } +} + +template +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + detail::get_actual_static_padding_value()>; +}; + +template +struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t> { + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + } + + template + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + std::integral_constant) { + if constexpr (_Extents::rank() > 1) { + return {other_mapping.stride(_PaddingStrideIdx)}; + } else { + return {}; + } + } +}; +} // namespace detail + +template +template +class layout_left_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded; + +#ifndef MDSPAN_INTERNAL_TEST +private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "out of bounds access for rank 0"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + index_type indices[] = {static_cast(index_offsets)...}; + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 + index_type res = 0; + ((res = indices[extents_type::rank() - 1 - Ranks] + + ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx + ? padded_stride.value(0) + : exts.extent(extents_type::rank() - 1 - Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type& ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) + {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + constexpr + mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr + mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) + { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == 0) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; + +template +template +class layout_right_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast(index_offsets) + + (Ranks == extent_to_pad_idx ? padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) + { + s[r] = value; + value *= exts.extent(r); + } + s[0] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = 1; + for (rank_type r = 0; r < extent_to_pad_idx; ++r) + { + value *= exts.extent(r); + } + return value * padded_stride.value(0); + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == extents_type::rank() - 1) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; +} +} diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp new file mode 100644 index 0000000000..945f091a2d --- /dev/null +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "../__p0009_bits/dynamic_extent.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +template +struct layout_left_padded { + template + class mapping; +}; + +template +struct layout_right_padded { + template + class mapping; +}; + +namespace detail { +// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap +template +struct layout_padded_constants; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = 1; + static constexpr rank_type extent_to_pad_idx = 0; +}; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2; + static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1; +}; + +template +struct is_layout_left_padded : std::false_type {}; + +template +struct is_layout_left_padded> : std::true_type {}; + +template +struct is_layout_left_padded_mapping : std::false_type {}; + +template +struct is_layout_left_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +struct is_layout_right_padded : std::false_type {}; + +template +struct is_layout_right_padded> : std::true_type {}; + +template +struct is_layout_right_padded_mapping : std::false_type {}; + +template +struct is_layout_right_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +constexpr void check_padded_layout_converting_constructor_mandates() +{ + if constexpr (_LayoutExtentsType::rank() > 1) { + using extents_type = typename _PaddedLayoutMappingType::extents_type; + constexpr auto padding_value = _PaddedLayoutMappingType::padding_value; + constexpr auto idx = layout_padded_constants::extent_to_pad_idx; + if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) && + (extents_type::static_extent(idx) != dynamic_extent) && + (padding_value != dynamic_extent)) { + if constexpr (padding_value == 0) { + static_assert(_LayoutExtentsType::static_extent(idx) == 0); + } else { + static_assert( + _LayoutExtentsType::static_extent(idx) % padding_value == 0); + } + } + } +} + +template +constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) { + if constexpr (_ExtentsType::rank() > 1) { + constexpr auto padded_stride_idx = + layout_padded_constants::padded_stride_idx; + constexpr auto extent_to_pad_idx = layout_padded_constants::extent_to_pad_idx; + assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx)); + } +} +} +} +} diff --git a/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp b/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp index b440873526..ac72a1a4e6 100644 --- a/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp +++ b/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp @@ -35,6 +35,7 @@ #include "../experimental/__p0009_bits/layout_right.hpp" #include "../experimental/__p0009_bits/macros.hpp" #if MDSPAN_HAS_CXX_17 +#include "../experimental/__p2642_bits/layout_padded.hpp" #include "../experimental/__p2630_bits/submdspan.hpp" #endif diff --git a/src/.gitignore b/src/.gitignore index 6e9fd14b3f..b145f81159 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1361,6 +1361,8 @@ /pair_oxrna2_*.cpp /pair_oxrna2_*.h /mf_oxdna.h +/pair_pedone.cpp +/pair_pedone.h /pair_peri.cpp /pair_peri.h /pair_peri_eps.cpp diff --git a/src/ASPHERE/pair_ylz.cpp b/src/ASPHERE/pair_ylz.cpp index a678712619..833dc8e3e4 100644 --- a/src/ASPHERE/pair_ylz.cpp +++ b/src/ASPHERE/pair_ylz.cpp @@ -300,6 +300,7 @@ double PairYLZ::init_one(int i, int j) zeta[j][i] = zeta[i][j]; mu[j][i] = mu[i][j]; beta[j][i] = beta[i][j]; + cut[j][i] = cut[i][j]; return cut[i][j]; } @@ -409,7 +410,7 @@ void PairYLZ::write_data_all(FILE *fp) { for (int i = 1; i <= atom->ntypes; i++) for (int j = i; j <= atom->ntypes; j++) - fprintf(fp, "%d %d %g %g %g %g %g %g\n", i, j, epsilon[i][i], sigma[i][i], cut[i][j], + fprintf(fp, "%d %d %g %g %g %g %g %g\n", i, j, epsilon[i][j], sigma[i][j], cut[i][j], zeta[i][j], mu[i][j], beta[i][j]); } diff --git a/src/BODY/body_rounded_polyhedron.cpp b/src/BODY/body_rounded_polyhedron.cpp index 991f52cac5..6aa15b740b 100644 --- a/src/BODY/body_rounded_polyhedron.cpp +++ b/src/BODY/body_rounded_polyhedron.cpp @@ -99,10 +99,9 @@ int BodyRoundedPolyhedron::nedges(AtomVecBody::Bonus *bonus) { int nvertices = bonus->ivalue[0]; int nedges = bonus->ivalue[1]; - //int nfaces = bonus->ivalue[2]; if (nvertices == 1) return 0; else if (nvertices == 2) return 1; - return nedges; //(nvertices+nfaces-2); // Euler formula: V-E+F=2 + return nedges; } /* ---------------------------------------------------------------------- */ @@ -116,6 +115,9 @@ double *BodyRoundedPolyhedron::edges(AtomVecBody::Bonus *bonus) int BodyRoundedPolyhedron::nfaces(AtomVecBody::Bonus *bonus) { + int nvertices = bonus->ivalue[0]; + if (nvertices < 3) return 0; + return bonus->ivalue[2]; } diff --git a/src/BODY/pair_body_rounded_polygon.cpp b/src/BODY/pair_body_rounded_polygon.cpp index 432f1d5c9c..2293f56a98 100644 --- a/src/BODY/pair_body_rounded_polygon.cpp +++ b/src/BODY/pair_body_rounded_polygon.cpp @@ -415,17 +415,14 @@ void PairBodyRoundedPolygon::init_style() if (!avec) error->all(FLERR,"Pair body/rounded/polygon requires atom style body"); if (strcmp(avec->bptr->style,"rounded/polygon") != 0) - error->all(FLERR,"Pair body/rounded/polygon requires " - "body style rounded/polygon"); + error->all(FLERR,"Pair body/rounded/polygon requires body style rounded/polygon"); bptr = dynamic_cast(avec->bptr); if (force->newton_pair == 0) - error->all(FLERR,"Pair style body/rounded/polygon requires " - "newton pair on"); + error->all(FLERR,"Pair style body/rounded/polygon requires newton pair on"); if (comm->ghost_velocity == 0) - error->all(FLERR,"Pair body/rounded/polygon requires " - "ghost atoms store velocity"); + error->all(FLERR,"Pair body/rounded/polygon requires ghost atoms store velocity"); neighbor->add_request(this); @@ -463,27 +460,24 @@ void PairBodyRoundedPolygon::init_style() for (i = 1; i <= ntypes; i++) maxerad[i] = merad[i] = 0; - int ipour; - for (ipour = 0; ipour < modify->nfix; ipour++) - if (strcmp(modify->fix[ipour]->style,"pour") == 0) break; - if (ipour == modify->nfix) ipour = -1; + Fix *fixpour = nullptr; + auto pours = modify->get_fix_by_style("^pour"); + if (pours.size() > 0) fixpour = pours[0]; + + Fix *fixdep = nullptr; + auto deps = modify->get_fix_by_style("^deposit"); + if (deps.size() > 0) fixdep = deps[0]; - int idep; - for (idep = 0; idep < modify->nfix; idep++) - if (strcmp(modify->fix[idep]->style,"deposit") == 0) break; - if (idep == modify->nfix) idep = -1; for (i = 1; i <= ntypes; i++) { merad[i] = 0.0; - if (ipour >= 0) { + if (fixpour) { itype = i; - merad[i] = - *((double *) modify->fix[ipour]->extract("radius",itype)); + merad[i] = *((double *) fixpour->extract("radius",itype)); } - if (idep >= 0) { + if (fixdep) { itype = i; - merad[i] = - *((double *) modify->fix[idep]->extract("radius",itype)); + merad[i] = *((double *) fixdep->extract("radius",itype)); } } @@ -570,8 +564,7 @@ void PairBodyRoundedPolygon::body2space(int i) } if ((body_num_edges > 0) && (edge_ends == nullptr)) - error->one(FLERR,"Inconsistent edge data for body of atom {}", - atom->tag[i]); + error->one(FLERR,"Inconsistent edge data for body of atom {}", atom->tag[i]); for (int m = 0; m < body_num_edges; m++) { edge[nedge][0] = static_cast(edge_ends[2*m+0]); diff --git a/src/BODY/pair_body_rounded_polyhedron.cpp b/src/BODY/pair_body_rounded_polyhedron.cpp index 82660df1e0..ed83dc49e2 100644 --- a/src/BODY/pair_body_rounded_polyhedron.cpp +++ b/src/BODY/pair_body_rounded_polyhedron.cpp @@ -222,8 +222,7 @@ void PairBodyRoundedPolyhedron::compute(int eflag, int vflag) // sphere-sphere interaction if (npi == 1 && npj == 1) { - sphere_against_sphere(i, j, itype, jtype, delx, dely, delz, - rsq, v, f, evflag); + sphere_against_sphere(i, j, itype, jtype, delx, dely, delz, rsq, v, f, evflag); continue; } @@ -391,20 +390,16 @@ void PairBodyRoundedPolyhedron::coeff(int narg, char **arg) void PairBodyRoundedPolyhedron::init_style() { avec = dynamic_cast(atom->style_match("body")); - if (!avec) error->all(FLERR,"Pair body/rounded/polyhedron requires " - "atom style body"); + if (!avec) error->all(FLERR,"Pair body/rounded/polyhedron requires atom style body"); if (strcmp(avec->bptr->style,"rounded/polyhedron") != 0) - error->all(FLERR,"Pair body/rounded/polyhedron requires " - "body style rounded/polyhedron"); + error->all(FLERR,"Pair body/rounded/polyhedron requires body style rounded/polyhedron"); bptr = dynamic_cast(avec->bptr); if (force->newton_pair == 0) - error->all(FLERR,"Pair style body/rounded/polyhedron requires " - "newton pair on"); + error->all(FLERR,"Pair style body/rounded/polyhedron requires newton pair on"); if (comm->ghost_velocity == 0) - error->all(FLERR,"Pair body/rounded/polyhedron requires " - "ghost atoms store velocity"); + error->all(FLERR,"Pair body/rounded/polyhedron requires ghost atoms store velocity"); neighbor->add_request(this); @@ -446,27 +441,23 @@ void PairBodyRoundedPolyhedron::init_style() for (i = 1; i <= ntypes; i++) maxerad[i] = merad[i] = 0; - int ipour; - for (ipour = 0; ipour < modify->nfix; ipour++) - if (strcmp(modify->fix[ipour]->style,"pour") == 0) break; - if (ipour == modify->nfix) ipour = -1; + Fix *fixpour = nullptr; + auto pours = modify->get_fix_by_style("^pour"); + if (pours.size() > 0) fixpour = pours[0]; - int idep; - for (idep = 0; idep < modify->nfix; idep++) - if (strcmp(modify->fix[idep]->style,"deposit") == 0) break; - if (idep == modify->nfix) idep = -1; + Fix *fixdep = nullptr; + auto deps = modify->get_fix_by_style("^deposit"); + if (deps.size() > 0) fixdep = deps[0]; for (i = 1; i <= ntypes; i++) { merad[i] = 0.0; - if (ipour >= 0) { + if (fixpour) { itype = i; - merad[i] = - *((double *) modify->fix[ipour]->extract("radius",itype)); + merad[i] = *((double *) fixpour->extract("radius",itype)); } - if (idep >= 0) { + if (fixdep) { itype = i; - merad[i] = - *((double *) modify->fix[idep]->extract("radius",itype)); + merad[i] = *((double *) fixdep->extract("radius",itype)); } } @@ -558,8 +549,7 @@ void PairBodyRoundedPolyhedron::body2space(int i) } if ((body_num_edges > 0) && (edge_ends == nullptr)) - error->one(FLERR,"Inconsistent edge data for body of atom {}", - atom->tag[i]); + error->one(FLERR,"Inconsistent edge data for body of atom {}", atom->tag[i]); for (int m = 0; m < body_num_edges; m++) { edge[nedge][0] = static_cast(edge_ends[2*m+0]); @@ -585,8 +575,7 @@ void PairBodyRoundedPolyhedron::body2space(int i) } if ((body_num_faces > 0) && (face_pts == nullptr)) - error->one(FLERR,"Inconsistent face data for body of atom {}", - atom->tag[i]); + error->one(FLERR,"Inconsistent face data for body of atom {}", atom->tag[i]); for (int m = 0; m < body_num_faces; m++) { for (int k = 0; k < MAX_FACE_SIZE; k++) diff --git a/src/EXTRA-DUMP/dump_xtc.cpp b/src/EXTRA-DUMP/dump_xtc.cpp index 798bdcb391..3119a470dc 100644 --- a/src/EXTRA-DUMP/dump_xtc.cpp +++ b/src/EXTRA-DUMP/dump_xtc.cpp @@ -14,7 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: Naveen Michaud-Agrawal (Johns Hopkins U) - open-source XDR routines from + Open Source XDR based I/O routines from Frans van Hoesel (https://www.rug.nl/staff/f.h.j.van.hoesel/) are included in this file Axel Kohlmeyer (Temple U) @@ -35,27 +35,29 @@ #include "output.h" #include "update.h" +#include "xdr_compat.h" + #include #include #include using namespace LAMMPS_NS; -#define EPS 1e-5 -#define XTC_MAGIC 1995 +static constexpr double EPS = 1.0e-5; +static constexpr int XTC_MAGIC = 1995; #define MYMIN(a,b) ((a) < (b) ? (a) : (b)) #define MYMAX(a,b) ((a) > (b) ? (a) : (b)) -int xdropen(XDR *, const char *, const char *); -int xdrclose(XDR *); -void xdrfreebuf(); -int xdr3dfcoord(XDR *, float *, int *, float *); +static int xdropen(XDR *, const char *, const char *); +static int xdrclose(XDR *); +static void xdrfreebuf(); +static int xdr3dfcoord(XDR *, float *, int *, float *); /* ---------------------------------------------------------------------- */ -DumpXTC::DumpXTC(LAMMPS *lmp, int narg, char **arg) : Dump(lmp, narg, arg), - coords(nullptr) +DumpXTC::DumpXTC(LAMMPS *lmp, int narg, char **arg) + : Dump(lmp, narg, arg), coords(nullptr), xd(nullptr) { if (narg != 5) error->all(FLERR,"Illegal dump xtc command"); if (binary || compressed || multifile || multiproc) @@ -68,6 +70,7 @@ DumpXTC::DumpXTC(LAMMPS *lmp, int narg, char **arg) : Dump(lmp, narg, arg), flush_flag = 0; unwrap_flag = 0; precision = 1000.0; + xd = new XDR; // allocate global array for atom coords @@ -105,9 +108,10 @@ DumpXTC::~DumpXTC() memory->destroy(coords); if (me == 0) { - xdrclose(&xd); + xdrclose(xd); xdrfreebuf(); } + delete xd; } /* ---------------------------------------------------------------------- */ @@ -150,7 +154,8 @@ void DumpXTC::openfile() fp = nullptr; if (me == 0) - if (xdropen(&xd,filename,"w") == 0) error->one(FLERR,"Cannot open dump file"); + if (xdropen(xd,filename,"w") == 0) + error->one(FLERR,"Cannot open XTC format dump file {}: {}", filename, utils::getsyserror()); } /* ---------------------------------------------------------------------- */ @@ -176,11 +181,11 @@ void DumpXTC::write_header(bigint nbig) if (me != 0) return; int tmp = XTC_MAGIC; - xdr_int(&xd,&tmp); - xdr_int(&xd,&n); - xdr_int(&xd,&ntimestep); + xdr_int(xd,&tmp); + xdr_int(xd,&n); + xdr_int(xd,&ntimestep); float time_value = ntimestep * tfactor * update->dt; - xdr_float(&xd,&time_value); + xdr_float(xd,&time_value); // cell basis vectors if (domain->triclinic) { @@ -192,18 +197,18 @@ void DumpXTC::write_header(bigint nbig) float xz = sfactor * domain->xz; float yz = sfactor * domain->yz; - xdr_float(&xd,&xdim); xdr_float(&xd,&zero); xdr_float(&xd,&zero); - xdr_float(&xd,&xy ); xdr_float(&xd,&ydim); xdr_float(&xd,&zero); - xdr_float(&xd,&xz ); xdr_float(&xd,&yz ); xdr_float(&xd,&zdim); + xdr_float(xd,&xdim); xdr_float(xd,&zero); xdr_float(xd,&zero); + xdr_float(xd,&xy ); xdr_float(xd,&ydim); xdr_float(xd,&zero); + xdr_float(xd,&xz ); xdr_float(xd,&yz ); xdr_float(xd,&zdim); } else { float zero = 0.0; float xdim = sfactor * (domain->boxhi[0] - domain->boxlo[0]); float ydim = sfactor * (domain->boxhi[1] - domain->boxlo[1]); float zdim = sfactor * (domain->boxhi[2] - domain->boxlo[2]); - xdr_float(&xd,&xdim); xdr_float(&xd,&zero); xdr_float(&xd,&zero); - xdr_float(&xd,&zero); xdr_float(&xd,&ydim); xdr_float(&xd,&zero); - xdr_float(&xd,&zero); xdr_float(&xd,&zero); xdr_float(&xd,&zdim); + xdr_float(xd,&xdim); xdr_float(xd,&zero); xdr_float(xd,&zero); + xdr_float(xd,&zero); xdr_float(xd,&ydim); xdr_float(xd,&zero); + xdr_float(xd,&zero); xdr_float(xd,&zero); xdr_float(xd,&zdim); } } @@ -328,7 +333,7 @@ double DumpXTC::memory_usage() void DumpXTC::write_frame() { - xdr3dfcoord(&xd,coords,&natoms,&precision); + xdr3dfcoord(xd,coords,&natoms,&precision); } // ---------------------------------------------------------------------- @@ -406,7 +411,7 @@ static int magicints[] = { | | xdropen - open xdr file | - | This versions differs from xdrstdio_create, because I need to know + | This version differs from xdrstdio_create, because I need to know | the state of the file (read or write) so I can use xdr3dfcoord | in eigther read or write mode, and the file descriptor | so I can close the file (something xdr_destroy doesn't do). @@ -1048,7 +1053,7 @@ int xdr3dfcoord(XDR *xdrs, float *fp, int *size, float *precision) } if (buf[1] != 0) buf[0]++; xdr_int(xdrs, &(buf[0])); /* buf[0] holds the length in bytes */ - return errval * (xdr_opaque(xdrs, (caddr_t)&(buf[3]), (u_int)buf[0])); + return errval * (xdr_opaque(xdrs, (char *)&(buf[3]), (unsigned int)buf[0])); } else { /* xdrs is open for reading */ @@ -1129,7 +1134,7 @@ int xdr3dfcoord(XDR *xdrs, float *fp, int *size, float *precision) if (xdr_int(xdrs, &(buf[0])) == 0) return 0; - if (xdr_opaque(xdrs, (caddr_t)&(buf[3]), (u_int)buf[0]) == 0) + if (xdr_opaque(xdrs, (char *)&(buf[3]), (unsigned int)buf[0]) == 0) return 0; buf[0] = buf[1] = buf[2] = 0; diff --git a/src/EXTRA-DUMP/dump_xtc.h b/src/EXTRA-DUMP/dump_xtc.h index 74147be06b..0a27cc67dd 100644 --- a/src/EXTRA-DUMP/dump_xtc.h +++ b/src/EXTRA-DUMP/dump_xtc.h @@ -21,8 +21,8 @@ DumpStyle(xtc,DumpXTC); #define LMP_DUMP_XTC_H #include "dump.h" -#include "xdr_compat.h" +struct XDR; namespace LAMMPS_NS { class DumpXTC : public Dump { @@ -37,7 +37,7 @@ class DumpXTC : public Dump { float precision; // user-adjustable precision setting float *coords; double sfactor, tfactor; // scaling factors for positions and time unit - XDR xd; + XDR *xd; void init_style() override; int modify_param(int, char **) override; diff --git a/src/EXTRA-DUMP/xdr_compat.cpp b/src/EXTRA-DUMP/xdr_compat.cpp index e29bbe8334..3caa38663c 100644 --- a/src/EXTRA-DUMP/xdr_compat.cpp +++ b/src/EXTRA-DUMP/xdr_compat.cpp @@ -1,54 +1,63 @@ -// clang-format off #include "xdr_compat.h" + #include #include -/* This file is needed for systems, that do not provide XDR support - * in their system libraries. It was written for windows, but will - * most probably work on other platforms too. better make sure you - * test that the xtc files produced are ok before using it. +/* + * This file contains an implementation of the Sun External Data Representation (XDR) + * routines. They have been adapted specifically for the use with the LAMMPS xtc dump + * style to produce compressed trajectory files in the Gromacs XTC format. * - * It is also needed on BG/L and Cray XT3/XT4 as we don't have - * XDR support in the lightweight kernel runtimes either. + * The XDR sources are avaiable under the BSD 3-clause license for example in + * the MIT Kerberos 5 distribution with the following copyright notice and license. * - * This file contains the definitions for Sun External Data - * Representation (XDR) headers and routines. + * @(#)xdr.h 2.2 88/07/29 4.0 RPCSRC * - * Although the rest of LAMPPS is GPL, you can copy and use the XDR - * routines in any way you want as long as you obey Sun's license: + * Copyright (c) 2010, Oracle America, Inc. * - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. + * All rights reserved. * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. + * * Neither the name of the "Oracle America, Inc." nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef __cplusplus extern "C" { #endif +#ifndef FALSE +#define FALSE (0) +#endif +#ifndef TRUE +#define TRUE (1) +#endif + +#define BYTES_PER_XDR_UNIT (4) + /* * for unit alignment */ @@ -58,19 +67,18 @@ static xdr_uint32_t xdr_swapbytes(xdr_uint32_t x) { xdr_uint32_t y; int i; - char *px=(char *)&x; - char *py=(char *)&y; + char *px = (char *) &x; + char *py = (char *) &y; - for (i=0;i<4;i++) - py[i]=px[3-i]; + for (i = 0; i < 4; i++) py[i] = px[3 - i]; return y; } static xdr_uint32_t xdr_htonl(xdr_uint32_t x) { - short s=0x0F00; - if (*((char *)&s)==(char)0x0F) { + short s = 0x0F00; + if (*((char *) &s) == (char) 0x0F) { /* bigendian, do nothing */ return x; } else { @@ -81,8 +89,8 @@ static xdr_uint32_t xdr_htonl(xdr_uint32_t x) static xdr_uint32_t xdr_ntohl(xdr_uint32_t x) { - short s=0x0F00; - if (*((char *)&s)==(char)0x0F) { + short s = 0x0F00; + if (*((char *) &s) == (char) 0x0F) { /* bigendian, do nothing */ return x; } else { @@ -91,221 +99,39 @@ static xdr_uint32_t xdr_ntohl(xdr_uint32_t x) } } - -/* - * Free a data structure using XDR - * Not a filter, but a convenient utility nonetheless - */ -void -xdr_free (xdrproc_t proc, char *objp) -{ - XDR x; - - x.x_op = XDR_FREE; - (*proc) (&x, objp); -} - -/* - * XDR nothing - */ -bool_t -xdr_void (void) -{ - return TRUE; -} - /* * XDR integers */ -bool_t -xdr_int (XDR *xdrs, int *ip) +bool_t xdr_int(XDR *xdrs, int *ip) { xdr_int32_t l; - switch (xdrs->x_op) - { + switch (xdrs->x_op) { + case XDR_ENCODE: l = (xdr_int32_t) (*ip); - return xdr_putint32 (xdrs, &l); + return xdr_putint32(xdrs, &l); + break; case XDR_DECODE: - if (!xdr_getint32 (xdrs, &l)) - { - return FALSE; - } + if (!xdr_getint32(xdrs, &l)) return FALSE; *ip = (int) l; + return TRUE; + break; case XDR_FREE: return TRUE; + break; } return FALSE; } - -/* - * XDR unsigned integers - */ -bool_t -xdr_u_int (XDR *xdrs, unsigned int *up) -{ - xdr_uint32_t l; - - switch (xdrs->x_op) - { - case XDR_ENCODE: - l = (xdr_uint32_t) (*up); - return xdr_putuint32 (xdrs, &l); - - case XDR_DECODE: - if (!xdr_getuint32 (xdrs, &l)) - { - return FALSE; - } - *up = (unsigned int) l; - - case XDR_FREE: - return TRUE; - } - return FALSE; -} - - - - -/* - * XDR short integers - */ -bool_t -xdr_short (XDR *xdrs, short *sp) -{ - xdr_int32_t l; - - switch (xdrs->x_op) - { - case XDR_ENCODE: - l = (xdr_int32_t) *sp; - return xdr_putint32 (xdrs, &l); - - case XDR_DECODE: - if (!xdr_getint32 (xdrs, &l)) - { - return FALSE; - } - *sp = (short) l; - return TRUE; - - case XDR_FREE: - return TRUE; - } - return FALSE; -} - - -/* - * XDR unsigned short integers - */ -bool_t -xdr_u_short (XDR *xdrs, unsigned short *usp) -{ - xdr_uint32_t l; - - switch (xdrs->x_op) - { - case XDR_ENCODE: - l = (xdr_uint32_t) *usp; - return xdr_putuint32 (xdrs, &l); - - case XDR_DECODE: - if (!xdr_getuint32 (xdrs, &l)) - { - return FALSE; - } - *usp = (unsigned short) l; - return TRUE; - - case XDR_FREE: - return TRUE; - } - return FALSE; -} - - -/* - * XDR a char - */ -bool_t -xdr_char (XDR *xdrs, char *cp) -{ - int i; - - i = (*cp); - if (!xdr_int (xdrs, &i)) - { - return FALSE; - } - *cp = i; - return TRUE; -} - -/* - * XDR an unsigned char - */ -bool_t -xdr_u_char (XDR *xdrs, unsigned char *cp) -{ - unsigned int u; - - u = (*cp); - if (!xdr_u_int (xdrs, &u)) - { - return FALSE; - } - *cp = u; - return TRUE; -} - -/* - * XDR booleans - */ -bool_t -xdr_bool (XDR *xdrs, int *bp) -{ -#define XDR_FALSE ((xdr_int32_t) 0) -#define XDR_TRUE ((xdr_int32_t) 1) - - xdr_int32_t lb; - - switch (xdrs->x_op) - { - case XDR_ENCODE: - lb = *bp ? XDR_TRUE : XDR_FALSE; - return xdr_putint32 (xdrs, &lb); - - case XDR_DECODE: - if (!xdr_getint32 (xdrs, &lb)) - { - return FALSE; - } - *bp = (lb == XDR_FALSE) ? FALSE : TRUE; - return TRUE; - - case XDR_FREE: - return TRUE; - } - return FALSE; -#undef XDR_FALSE -#undef XDR_TRUE -} - - - /* * XDR opaque data * Allows the specification of a fixed size sequence of opaque bytes. * cp points to the opaque object and cnt gives the byte length. */ -bool_t -xdr_opaque (XDR *xdrs, char *cp, unsigned int cnt) +bool_t xdr_opaque(XDR *xdrs, char *cp, unsigned int cnt) { unsigned int rndup; static char crud[BYTES_PER_XDR_UNIT]; @@ -313,219 +139,61 @@ xdr_opaque (XDR *xdrs, char *cp, unsigned int cnt) /* * if no data we are done */ - if (cnt == 0) - return TRUE; + if (cnt == 0) return TRUE; /* * round byte count to full xdr units */ rndup = cnt % BYTES_PER_XDR_UNIT; - if (rndup > 0) - rndup = BYTES_PER_XDR_UNIT - rndup; - - switch (xdrs->x_op) - { - case XDR_DECODE: - if (!xdr_getbytes (xdrs, cp, cnt)) - { - return FALSE; - } - if (rndup == 0) - return TRUE; - return xdr_getbytes (xdrs, (char *)crud, rndup); - - case XDR_ENCODE: - if (!xdr_putbytes (xdrs, cp, cnt)) - { - return FALSE; - } - if (rndup == 0) - return TRUE; - return xdr_putbytes (xdrs, xdr_zero, rndup); - - case XDR_FREE: - return TRUE; - } - return FALSE; -} - - -/* - * XDR null terminated ASCII strings - * xdr_string deals with "C strings" - arrays of bytes that are - * terminated by a nullptr character. The parameter cpp references a - * pointer to storage; If the pointer is null, then the necessary - * storage is allocated. The last parameter is the max allowed length - * of the string as specified by a protocol. - */ -bool_t -xdr_string (XDR *xdrs, char **cpp, unsigned int maxsize) -{ - char *sp = *cpp; /* sp is the actual string pointer */ - unsigned int size = 0; - unsigned int nodesize = 0; - - /* - * first deal with the length since xdr strings are counted-strings - */ - switch (xdrs->x_op) - { - case XDR_FREE: - if (sp == nullptr) - { - return TRUE; /* already free */ - } - /* fall through... */ - case XDR_ENCODE: - if (sp == nullptr) - return FALSE; - size = strlen (sp); - break; - case XDR_DECODE: - break; - } - - if (!xdr_u_int (xdrs, &size)) - { - return FALSE; - } - if (size > maxsize) - { - return FALSE; - } - nodesize = size + 1; - - /* - * now deal with the actual bytes - */ - switch (xdrs->x_op) - { - case XDR_DECODE: - if (nodesize == 0) - { - return TRUE; - } - if (sp == nullptr) - *cpp = sp = (char *) malloc (nodesize); - if (sp == nullptr) - { - (void) fputs ("xdr_string: out of memory\n", stderr); - return FALSE; - } - sp[size] = 0; - /* fall into ... */ - - case XDR_ENCODE: - return xdr_opaque (xdrs, sp, size); - - case XDR_FREE: - free (sp); - *cpp = nullptr; - return TRUE; - } - return FALSE; -} - - - -/* Floating-point stuff */ - -bool_t -xdr_float(XDR *xdrs, float *fp) -{ - xdr_int32_t tmp; - - switch (xdrs->x_op) { - - case XDR_ENCODE: - tmp = *(xdr_int32_t *)fp; - return (xdr_putint32(xdrs, &tmp)); - - break; - - case XDR_DECODE: - if (xdr_getint32(xdrs, &tmp)) { - *(xdr_int32_t *)fp = tmp; - return (TRUE); - } - - break; - - case XDR_FREE: - return (TRUE); - } - return (FALSE); -} - - -bool_t -xdr_double(XDR *xdrs, double *dp) -{ - - /* Windows and some other systems dont define double-precision - * word order in the header files, so unfortunately we have - * to calculate it! - */ - static int LSW=-1; /* Least significant fp word */ - int *ip; - xdr_int32_t tmp[2]; - - if (LSW<0) { - double x=0.987654321; /* Just a number */ - - /* Possible representations in IEEE double precision: - * (S=small endian, B=big endian) - * - * Byte order, Word order, Hex - * S S b8 56 0e 3c dd 9a ef 3f - * B S 3c 0e 56 b8 3f ef 9a dd - * S B dd 9a ef 3f b8 56 0e 3c - * B B 3f ef 9a dd 3c 0e 56 b8 - */ - - unsigned char ix = *((char *)&x); - - if (ix==0xdd || ix==0x3f) - LSW=1; /* Big endian word order */ - else if (ix==0xb8 || ix==0x3c) - LSW=0; /* Small endian word order */ - else { /* Catch strange errors */ - printf("Error when detecting floating-point word order.\n" - "Do you have a non-IEEE system?\n" - "If possible, use the XDR libraries provided with your system,\n" - "instead of the Gromacs fallback XDR source.\n"); - exit(0); - } - } + if (rndup > 0) rndup = BYTES_PER_XDR_UNIT - rndup; switch (xdrs->x_op) { - case XDR_ENCODE: - ip = (int *)dp; - tmp[0] = ip[!LSW]; - tmp[1] = ip[LSW]; - return (xdr_putint32(xdrs, tmp) && - xdr_putint32(xdrs, tmp+1)); + case XDR_DECODE: + if (!xdr_getbytes(xdrs, cp, cnt)) { return FALSE; } + if (rndup == 0) return TRUE; + return xdr_getbytes(xdrs, (char *) crud, rndup); + break; - break; + case XDR_ENCODE: + if (!xdr_putbytes(xdrs, cp, cnt)) { return FALSE; } + if (rndup == 0) return TRUE; + return xdr_putbytes(xdrs, xdr_zero, rndup); + break; - case XDR_DECODE: - ip = (int *)dp; - if (xdr_getint32(xdrs, tmp+!LSW) && - xdr_getint32(xdrs, tmp+LSW)) { - ip[0] = tmp[0]; - ip[1] = tmp[1]; - return (TRUE); - } - - break; - - case XDR_FREE: - return (TRUE); + case XDR_FREE: + return TRUE; + break; } - return (FALSE); + return FALSE; } +/* Floating-point stuff */ + +bool_t xdr_float(XDR *xdrs, float *fp) +{ + xdr_int32_t tmp; + + switch (xdrs->x_op) { + + case XDR_ENCODE: + tmp = *(xdr_int32_t *) fp; + return (xdr_putint32(xdrs, &tmp)); + break; + + case XDR_DECODE: + if (xdr_getint32(xdrs, &tmp)) { + *(xdr_int32_t *) fp = tmp; + return TRUE; + } + break; + + case XDR_FREE: + return TRUE; + break; + } + return FALSE; +} /* Array routines */ @@ -539,55 +207,37 @@ xdr_double(XDR *xdrs, double *dp) * > elemsize: size of each element * > xdr_elem: routine to XDR each element */ -bool_t -xdr_vector (XDR *xdrs, char *basep, unsigned int nelem, - unsigned int elemsize, xdrproc_t xdr_elem) +bool_t xdr_vector(XDR *xdrs, char *basep, unsigned int nelem, unsigned int elemsize, + xdrproc_t xdr_elem) { -#define LASTUNSIGNED ((unsigned int)0-1) +#define LASTUNSIGNED ((unsigned int) 0 - 1) unsigned int i; char *elptr; elptr = basep; - for (i = 0; i < nelem; i++) - { - if (!(*xdr_elem) (xdrs, elptr, LASTUNSIGNED)) - { - return FALSE; - } - elptr += elemsize; - } + for (i = 0; i < nelem; i++) { + if (!(*xdr_elem)(xdrs, elptr, LASTUNSIGNED)) { return FALSE; } + elptr += elemsize; + } return TRUE; #undef LASTUNSIGNED } - - -static bool_t xdrstdio_getbytes (XDR *, char *, unsigned int); -static bool_t xdrstdio_putbytes (XDR *, char *, unsigned int); -static unsigned int xdrstdio_getpos (XDR *); -static bool_t xdrstdio_setpos (XDR *, unsigned int); -static xdr_int32_t *xdrstdio_inline (XDR *, int); -static void xdrstdio_destroy (XDR *); -static bool_t xdrstdio_getint32 (XDR *, xdr_int32_t *); -static bool_t xdrstdio_putint32 (XDR *, xdr_int32_t *); -static bool_t xdrstdio_getuint32 (XDR *, xdr_uint32_t *); -static bool_t xdrstdio_putuint32 (XDR *, xdr_uint32_t *); +static bool_t xdrstdio_getbytes(XDR *, char *, unsigned int); +static bool_t xdrstdio_putbytes(XDR *, char *, unsigned int); +static void xdrstdio_destroy(XDR *); +static bool_t xdrstdio_getint32(XDR *, xdr_int32_t *); +static bool_t xdrstdio_putint32(XDR *, xdr_int32_t *); /* * Ops vector for stdio type XDR */ -static const struct xdr_ops xdrstdio_ops = -{ - xdrstdio_getbytes, /* deserialize counted bytes */ - xdrstdio_putbytes, /* serialize counted bytes */ - xdrstdio_getpos, /* get offset in the stream */ - xdrstdio_setpos, /* set offset in the stream */ - xdrstdio_inline, /* prime stream for inline macros */ - xdrstdio_destroy, /* destroy stream */ - xdrstdio_getint32, /* deserialize a int */ - xdrstdio_putint32, /* serialize a int */ - xdrstdio_getuint32, /* deserialize a int */ - xdrstdio_putuint32 /* serialize a int */ +static const struct xdr_ops xdrstdio_ops = { + xdrstdio_getbytes, /* deserialize counted bytes */ + xdrstdio_putbytes, /* serialize counted bytes */ + xdrstdio_destroy, /* destroy stream */ + xdrstdio_getint32, /* deserialize a int */ + xdrstdio_putint32, /* serialize a int */ }; /* @@ -595,8 +245,7 @@ static const struct xdr_ops xdrstdio_ops = * Sets the xdr stream handle xdrs for use on the stream file. * Operation flag is set to op. */ -void -xdrstdio_create (XDR *xdrs, FILE *file, enum xdr_op op) +void xdrstdio_create(XDR *xdrs, FILE *file, enum xdr_op op) { xdrs->x_op = op; /* We have to add the const since the `struct xdr_ops' in `struct XDR' @@ -611,104 +260,42 @@ xdrstdio_create (XDR *xdrs, FILE *file, enum xdr_op op) * Destroy a stdio xdr stream. * Cleans up the xdr stream handle xdrs previously set up by xdrstdio_create. */ -static void -xdrstdio_destroy (XDR *xdrs) +static void xdrstdio_destroy(XDR *xdrs) { - (void) fflush ((FILE *) xdrs->x_private); + (void) fflush((FILE *) xdrs->x_private); /* xx should we close the file ?? */ } - -static bool_t -xdrstdio_getbytes (XDR *xdrs, char *addr, unsigned int len) +static bool_t xdrstdio_getbytes(XDR *xdrs, char *addr, unsigned int len) { - if ((len != 0) && (fread (addr, (int) len, 1, - (FILE *) xdrs->x_private) != 1)) - return FALSE; + if ((len != 0) && (fread(addr, (int) len, 1, (FILE *) xdrs->x_private) != 1)) return FALSE; return TRUE; } -static bool_t -xdrstdio_putbytes (XDR *xdrs, char *addr, unsigned int len) +static bool_t xdrstdio_putbytes(XDR *xdrs, char *addr, unsigned int len) { - if ((len != 0) && (fwrite (addr, (int) len, 1, - (FILE *) xdrs->x_private) != 1)) - return FALSE; + if ((len != 0) && (fwrite(addr, (int) len, 1, (FILE *) xdrs->x_private) != 1)) return FALSE; return TRUE; } -static unsigned int -xdrstdio_getpos (XDR *xdrs) -{ - return (unsigned int) ftell ((FILE *) xdrs->x_private); -} - -static bool_t -xdrstdio_setpos (XDR *xdrs, unsigned int pos) -{ - return fseek ((FILE *) xdrs->x_private, (xdr_int32_t) pos, 0) < 0 ? FALSE : TRUE; -} - -static xdr_int32_t * -xdrstdio_inline (XDR * /*xdrs*/, int /*len*/) -{ - /* - * Must do some work to implement this: must ensure - * enough data in the underlying stdio buffer, - * that the buffer is aligned so that we can indirect through a - * long *, and stuff this pointer in xdrs->x_buf. Doing - * a fread or fwrite to a scratch buffer would defeat - * most of the gains to be had here and require storage - * management on this buffer, so we don't do this. - */ - return nullptr; -} - -static bool_t -xdrstdio_getint32 (XDR *xdrs, xdr_int32_t *ip) +static bool_t xdrstdio_getint32(XDR *xdrs, xdr_int32_t *ip) { xdr_int32_t mycopy; - if (fread ((char *) &mycopy, 4, 1, (FILE *) xdrs->x_private) != 1) - return FALSE; - *ip = xdr_ntohl (mycopy); + if (fread((char *) &mycopy, 4, 1, (FILE *) xdrs->x_private) != 1) return FALSE; + *ip = xdr_ntohl(mycopy); return TRUE; } -static bool_t -xdrstdio_putint32 (XDR *xdrs, xdr_int32_t *ip) +static bool_t xdrstdio_putint32(XDR *xdrs, xdr_int32_t *ip) { - xdr_int32_t mycopy = xdr_htonl (*ip); + xdr_int32_t mycopy = xdr_htonl(*ip); ip = &mycopy; - if (fwrite ((char *) ip, 4, 1, (FILE *) xdrs->x_private) != 1) - return FALSE; + if (fwrite((char *) ip, 4, 1, (FILE *) xdrs->x_private) != 1) return FALSE; return TRUE; } -static bool_t -xdrstdio_getuint32 (XDR *xdrs, xdr_uint32_t *ip) -{ - xdr_uint32_t mycopy; - - if (fread ((char *) &mycopy, 4, 1, (FILE *) xdrs->x_private) != 1) - return FALSE; - *ip = xdr_ntohl (mycopy); - return TRUE; -} - -static bool_t -xdrstdio_putuint32 (XDR *xdrs, xdr_uint32_t *ip) -{ - xdr_uint32_t mycopy = xdr_htonl (*ip); - - ip = &mycopy; - if (fwrite ((char *) ip, 4, 1, (FILE *) xdrs->x_private) != 1) - return FALSE; - return TRUE; -} - #ifdef __cplusplus } #endif - diff --git a/src/EXTRA-DUMP/xdr_compat.h b/src/EXTRA-DUMP/xdr_compat.h index 55dc22e9f9..d600a4a404 100644 --- a/src/EXTRA-DUMP/xdr_compat.h +++ b/src/EXTRA-DUMP/xdr_compat.h @@ -1,7 +1,7 @@ #ifndef LMP_XDR_COMPAT_H #define LMP_XDR_COMPAT_H -#include +#include #include #ifdef __cplusplus @@ -9,47 +9,53 @@ extern "C" { #endif /* - * This file is needed for systems, that do not provide XDR support - * in their system libraries. It was written for windows, but will - * most probably work on other platforms too. better make sure you - * test that the xtc files produced are ok before using it. + * This file contains the definitions for Sun External Data Representation (XDR). + * They have been adapted specifically for the use with the LAMMPS xtc dump style + * to produce compressed trajectory files in the Gromacs XTC format. * - * It is also needed on BG/L, BG/P and Cray XT3/XT4/XT5 as we don't - * have XDR support in the lightweight kernel runtimes either. + * The XDR sources are avaiable under the BSD 3-clause license for example in + * the MIT Kerberos 5 distribution with the following copyright notice and license. * - * This file contains the definitions for Sun External Data - * Representation (XDR) headers and routines. + * @(#)xdr.h 2.2 88/07/29 4.0 RPCSRC * - * Although the rest of LAMPPS is GPL, you can copy and use the XDR - * routines in any way you want as long as you obey Sun's license: - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. + * Copyright (c) 2010, Oracle America, Inc. * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. + * All rights reserved. * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 + * * Neither the name of the "Oracle America, Inc." nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* compatibility typedefs */ +typedef int bool_t; + +typedef int32_t xdr_int32_t; +typedef uint32_t xdr_uint32_t; + /* * Xdr operations. XDR_ENCODE causes the type to be encoded into the * stream. XDR_DECODE causes the type to be extracted from the stream. @@ -57,44 +63,8 @@ extern "C" { * XDR_DECODE request. */ -typedef int bool_t; - -#if defined(_WIN32) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || \ - defined(__OpenBSD__) || defined(__NetBSD__) || (defined(__linux__) && !defined(__GLIBC_MINOR__)) -typedef char *caddr_t; -typedef unsigned int u_int; -#endif - -/* - * Aninteger type that is 32 bits wide. Check if int, - * long or short is 32 bits and die if none of them is :-) - */ -#if (INT_MAX == 2147483647) -typedef int xdr_int32_t; -typedef unsigned int xdr_uint32_t; -#elif (LONG_MAX == 2147483647L) -typedef long xdr_int32_t; -typedef unsigned long xdr_uint32_t; -#elif (SHRT_MAX == 2147483647) -typedef short xdr_int32_t; -typedef unsigned short xdr_uint32_t; -#else -#error ERROR: No 32 bit wide integer type found! -#endif - enum xdr_op { XDR_ENCODE = 0, XDR_DECODE = 1, XDR_FREE = 2 }; -#ifndef FALSE -#define FALSE (0) -#endif -#ifndef TRUE -#define TRUE (1) -#endif - -#define BYTES_PER_XDR_UNIT (4) -/* Macro to round up to units of 4. */ -#define XDR_RNDUP(x) (((x) + BYTES_PER_XDR_UNIT - 1) & ~(BYTES_PER_XDR_UNIT - 1)) - /* * The XDR handle. * Contains operation which is being applied to the stream, @@ -113,26 +83,16 @@ struct XDR { }; struct xdr_ops { + /* get some bytes from XDR stream */ bool_t (*x_getbytes)(XDR *__xdrs, char *__addr, unsigned int __len); - /* get some bytes from " */ + /* put some bytes to XDR stream */ bool_t (*x_putbytes)(XDR *__xdrs, char *__addr, unsigned int __len); - /* put some bytes to " */ - unsigned int (*x_getpostn)(XDR *__xdrs); - /* returns bytes off from beginning */ - bool_t (*x_setpostn)(XDR *__xdrs, unsigned int __pos); - /* lets you reposition the stream */ - xdr_int32_t *(*x_inline)(XDR *__xdrs, int __len); - /* buf quick ptr to buffered data */ - void (*x_destroy)(XDR *__xdrs); /* free privates of this xdr_stream */ + void (*x_destroy)(XDR *__xdrs); + /* get a int from XDR stream */ bool_t (*x_getint32)(XDR *__xdrs, xdr_int32_t *__ip); - /* get a int from underlying stream */ + /* put a int to XDR stream */ bool_t (*x_putint32)(XDR *__xdrs, xdr_int32_t *__ip); - /* put a int to " */ - bool_t (*x_getuint32)(XDR *__xdrs, xdr_uint32_t *__ip); - /* get a unsigned int from underlying stream */ - bool_t (*x_putuint32)(XDR *__xdrs, xdr_uint32_t *__ip); - /* put a int to " */ }; /* @@ -151,53 +111,25 @@ typedef bool_t (*xdrproc_t)(XDR *, void *, ...); * * XDR *xdrs; * xdr_int32_t *int32p; - * long *longp; - * char *addr; * unsigned int len; - * unsigned int pos; */ #define xdr_getint32(xdrs, int32p) (*(xdrs)->x_ops->x_getint32)(xdrs, int32p) - #define xdr_putint32(xdrs, int32p) (*(xdrs)->x_ops->x_putint32)(xdrs, int32p) - -#define xdr_getuint32(xdrs, uint32p) (*(xdrs)->x_ops->x_getuint32)(xdrs, uint32p) - -#define xdr_putuint32(xdrs, uint32p) (*(xdrs)->x_ops->x_putuint32)(xdrs, uint32p) - #define xdr_getbytes(xdrs, addr, len) (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len) - #define xdr_putbytes(xdrs, addr, len) (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len) - -#define xdr_getpos(xdrs) (*(xdrs)->x_ops->x_getpostn)(xdrs) - -#define xdr_setpos(xdrs, pos) (*(xdrs)->x_ops->x_setpostn)(xdrs, pos) - -#define xdr_inline(xdrs, len) (*(xdrs)->x_ops->x_inline)(xdrs, len) - #define xdr_destroy(xdrs) \ do { \ if ((xdrs)->x_ops->x_destroy) (*(xdrs)->x_ops->x_destroy)(xdrs); \ } while (0) extern bool_t xdr_int(XDR *__xdrs, int *__ip); -extern bool_t xdr_u_int(XDR *__xdrs, unsigned int *__ip); -extern bool_t xdr_short(XDR *__xdrs, short *__ip); -extern bool_t xdr_u_short(XDR *__xdrs, unsigned short *__ip); -extern bool_t xdr_bool(XDR *__xdrs, int *__bp); extern bool_t xdr_opaque(XDR *__xdrs, char *__cp, unsigned int __cnt); -extern bool_t xdr_string(XDR *__xdrs, char **__cpp, unsigned int __maxsize); -extern bool_t xdr_char(XDR *__xdrs, char *__cp); -extern bool_t xdr_u_char(XDR *__xdrs, unsigned char *__cp); extern bool_t xdr_vector(XDR *__xdrs, char *__basep, unsigned int __nelem, unsigned int __elemsize, xdrproc_t __xdr_elem); extern bool_t xdr_float(XDR *__xdrs, float *__fp); -extern bool_t xdr_double(XDR *__xdrs, double *__dp); extern void xdrstdio_create(XDR *__xdrs, FILE *__file, enum xdr_op __xop); -/* free memory buffers for xdr */ -extern void xdr_free(xdrproc_t __proc, char *__objp); - #ifdef __cplusplus } #endif diff --git a/src/EXTRA-FIX/fix_ave_correlate_long.cpp b/src/EXTRA-FIX/fix_ave_correlate_long.cpp index fc1760b353..7c80365c57 100644 --- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp +++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp @@ -454,6 +454,8 @@ void FixAveCorrelateLong::end_of_step() scalar = val.val.f->compute_vector(val.argindex-1); // evaluate equal-style or vector-style variable + // if index exceeds vector length, use a zero value + // this can be useful if vector length is not known a priori } else if (val.which == ArgInfo::VARIABLE) { if (val.argindex == 0) @@ -462,7 +464,7 @@ void FixAveCorrelateLong::end_of_step() double *varvec; int nvec = input->variable->compute_vector(val.val.v,&varvec); int index = val.argindex; - if (nvec < index) scalar = 0.0; + if (index > nvec) scalar = 0.0; else scalar = varvec[index-1]; } } diff --git a/src/EXTRA-PAIR/pair_pedone.cpp b/src/EXTRA-PAIR/pair_pedone.cpp new file mode 100644 index 0000000000..c5f7f49117 --- /dev/null +++ b/src/EXTRA-PAIR/pair_pedone.cpp @@ -0,0 +1,389 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +#include "pair_pedone.h" + +#include "atom.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "memory.h" +#include "neigh_list.h" + +#include +#include + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairPedone::PairPedone(LAMMPS *lmp) : + Pair(lmp), d0(nullptr), alpha(nullptr), r0(nullptr), c0(nullptr), pedone1(nullptr), + pedone2(nullptr) +{ + writedata = 1; +} + +/* ---------------------------------------------------------------------- */ + +PairPedone::~PairPedone() +{ + if (copymode) return; + + if (allocated) { + memory->destroy(setflag); + memory->destroy(cutsq); + + memory->destroy(cut); + memory->destroy(c0); + memory->destroy(d0); + memory->destroy(alpha); + memory->destroy(r0); + memory->destroy(pedone1); + memory->destroy(pedone2); + memory->destroy(offset); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairPedone::compute(int eflag, int vflag) +{ + int i, j, ii, jj, inum, jnum, itype, jtype; + double xtmp, ytmp, ztmp, delx, dely, delz, evdwl, fpair; + double rsq, r, r2inv, r6inv, dr, dexp, factor_lj; + int *ilist, *jlist, *numneigh, **firstneigh; + + evdwl = 0.0; + ev_init(eflag, vflag); + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + double *special_lj = force->special_lj; + int newton_pair = force->newton_pair; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0 / rsq; + r6inv = r2inv * r2inv * r2inv; + r = sqrt(rsq); + dr = r - r0[itype][jtype]; + dexp = exp(-alpha[itype][jtype] * dr); + fpair = pedone1[itype][jtype] * (dexp * dexp - dexp) / r + + pedone2[itype][jtype] * r6inv * r6inv * r2inv; + fpair *= factor_lj; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + if (newton_pair || j < nlocal) { + f[j][0] -= delx * fpair; + f[j][1] -= dely * fpair; + f[j][2] -= delz * fpair; + } + + if (eflag) { + evdwl = d0[itype][jtype] * (dexp * dexp - 2.0 * dexp) + c0[itype][jtype] * r6inv * r6inv - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally(i, j, nlocal, newton_pair, evdwl, 0.0, fpair, delx, dely, delz); + } + } + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + +/* ---------------------------------------------------------------------- + allocate all arrays +------------------------------------------------------------------------- */ + +void PairPedone::allocate() +{ + allocated = 1; + int np1 = atom->ntypes + 1; + + memory->create(setflag, np1, np1, "pair:setflag"); + for (int i = 1; i < np1; i++) + for (int j = i; j < np1; j++) setflag[i][j] = 0; + + memory->create(cutsq, np1, np1, "pair:cutsq"); + + memory->create(cut, np1, np1, "pair:cut"); + memory->create(c0, np1, np1, "pair:c0"); + memory->create(d0, np1, np1, "pair:d0"); + memory->create(alpha, np1, np1, "pair:alpha"); + memory->create(r0, np1, np1, "pair:r0"); + memory->create(pedone1, np1, np1, "pair:pedone1"); + memory->create(pedone2, np1, np1, "pair:pedone2"); + memory->create(offset, np1, np1, "pair:offset"); +} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairPedone::settings(int narg, char **arg) +{ + if (narg != 1) error->all(FLERR, "Illegal pair_style command"); + + cut_global = utils::numeric(FLERR, arg[0], false, lmp); + + // reset cutoffs that have been explicitly set + + if (allocated) { + int i, j; + for (i = 1; i <= atom->ntypes; i++) + for (j = i; j <= atom->ntypes; j++) + if (setflag[i][j]) cut[i][j] = cut_global; + } +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs +------------------------------------------------------------------------- */ + +void PairPedone::coeff(int narg, char **arg) +{ + if (narg < 6 || narg > 7) error->all(FLERR, "Incorrect args for pair coefficients"); + if (!allocated) allocate(); + + int ilo, ihi, jlo, jhi; + utils::bounds(FLERR, arg[0], 1, atom->ntypes, ilo, ihi, error); + utils::bounds(FLERR, arg[1], 1, atom->ntypes, jlo, jhi, error); + + double d0_one = utils::numeric(FLERR, arg[2], false, lmp); + double alpha_one = utils::numeric(FLERR, arg[3], false, lmp); + double r0_one = utils::numeric(FLERR, arg[4], false, lmp); + double c0_one = utils::numeric(FLERR, arg[5], false, lmp); + + double cut_one = cut_global; + if (narg == 7) cut_one = utils::numeric(FLERR, arg[6], false, lmp); + + int count = 0; + for (int i = ilo; i <= ihi; i++) { + for (int j = MAX(jlo, i); j <= jhi; j++) { + c0[i][j] = c0_one; + d0[i][j] = d0_one; + alpha[i][j] = alpha_one; + r0[i][j] = r0_one; + cut[i][j] = cut_one; + setflag[i][j] = 1; + count++; + } + } + + if (count == 0) error->all(FLERR, "Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +double PairPedone::init_one(int i, int j) +{ + if (setflag[i][j] == 0) error->all(FLERR, "All pair coeffs are not set"); + + pedone1[i][j] = 2.0 * d0[i][j] * alpha[i][j]; + pedone2[i][j] = 12.0 * c0[i][j]; + + if (offset_flag) { + double alpha_dr = -alpha[i][j] * (cut[i][j] - r0[i][j]); + offset[i][j] = + d0[i][j] * (exp(2.0 * alpha_dr) - 2.0 * exp(alpha_dr)) - c0[i][j] / pow(cut[i][j], 12.0); + } else + offset[i][j] = 0.0; + + c0[j][i] = c0[i][j]; + d0[j][i] = d0[i][j]; + alpha[j][i] = alpha[i][j]; + r0[j][i] = r0[i][j]; + pedone1[j][i] = pedone1[i][j]; + pedone2[j][i] = pedone2[i][j]; + offset[j][i] = offset[i][j]; + + return cut[i][j]; +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +void PairPedone::write_restart(FILE *fp) +{ + write_restart_settings(fp); + + int i, j; + for (i = 1; i <= atom->ntypes; i++) { + for (j = i; j <= atom->ntypes; j++) { + fwrite(&setflag[i][j], sizeof(int), 1, fp); + if (setflag[i][j]) { + fwrite(&c0[i][j], sizeof(double), 1, fp); + fwrite(&d0[i][j], sizeof(double), 1, fp); + fwrite(&alpha[i][j], sizeof(double), 1, fp); + fwrite(&r0[i][j], sizeof(double), 1, fp); + fwrite(&cut[i][j], sizeof(double), 1, fp); + } + } + } +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +void PairPedone::read_restart(FILE *fp) +{ + read_restart_settings(fp); + + allocate(); + + int i, j; + int me = comm->me; + for (i = 1; i <= atom->ntypes; i++) { + for (j = i; j <= atom->ntypes; j++) { + if (me == 0) utils::sfread(FLERR, &setflag[i][j], sizeof(int), 1, fp, nullptr, error); + MPI_Bcast(&setflag[i][j], 1, MPI_INT, 0, world); + if (setflag[i][j]) { + if (me == 0) { + utils::sfread(FLERR, &c0[i][j], sizeof(double), 1, fp, nullptr, error); + utils::sfread(FLERR, &d0[i][j], sizeof(double), 1, fp, nullptr, error); + utils::sfread(FLERR, &alpha[i][j], sizeof(double), 1, fp, nullptr, error); + utils::sfread(FLERR, &r0[i][j], sizeof(double), 1, fp, nullptr, error); + utils::sfread(FLERR, &cut[i][j], sizeof(double), 1, fp, nullptr, error); + } + MPI_Bcast(&c0[i][j], 1, MPI_DOUBLE, 0, world); + MPI_Bcast(&d0[i][j], 1, MPI_DOUBLE, 0, world); + MPI_Bcast(&alpha[i][j], 1, MPI_DOUBLE, 0, world); + MPI_Bcast(&r0[i][j], 1, MPI_DOUBLE, 0, world); + MPI_Bcast(&cut[i][j], 1, MPI_DOUBLE, 0, world); + } + } + } +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +void PairPedone::write_restart_settings(FILE *fp) +{ + fwrite(&cut_global, sizeof(double), 1, fp); + fwrite(&offset_flag, sizeof(int), 1, fp); + fwrite(&mix_flag, sizeof(int), 1, fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +void PairPedone::read_restart_settings(FILE *fp) +{ + if (comm->me == 0) { + utils::sfread(FLERR, &cut_global, sizeof(double), 1, fp, nullptr, error); + utils::sfread(FLERR, &offset_flag, sizeof(int), 1, fp, nullptr, error); + utils::sfread(FLERR, &mix_flag, sizeof(int), 1, fp, nullptr, error); + } + MPI_Bcast(&cut_global, 1, MPI_DOUBLE, 0, world); + MPI_Bcast(&offset_flag, 1, MPI_INT, 0, world); + MPI_Bcast(&mix_flag, 1, MPI_INT, 0, world); +} + +/* ---------------------------------------------------------------------- + proc 0 writes to data file +------------------------------------------------------------------------- */ + +void PairPedone::write_data(FILE *fp) +{ + for (int i = 1; i <= atom->ntypes; i++) + fprintf(fp, "%d %g %g %g %g\n", i, d0[i][i], alpha[i][i], r0[i][i], c0[i][i]); +} + +/* ---------------------------------------------------------------------- + proc 0 writes all pairs to data file +------------------------------------------------------------------------- */ + +void PairPedone::write_data_all(FILE *fp) +{ + for (int i = 1; i <= atom->ntypes; i++) + for (int j = i; j <= atom->ntypes; j++) + fprintf(fp, "%d %d %g %g %g %g %g\n", i, j, d0[i][j], alpha[i][j], r0[i][j], c0[i][j], + cut[i][j]); +} + +/* ---------------------------------------------------------------------- */ + +double PairPedone::single(int /*i*/, int /*j*/, int itype, int jtype, double rsq, + double /*factor_coul*/, double factor_lj, double &fforce) +{ + double r, dr, dexp, phi, r2inv, r6inv; + + r = sqrt(rsq); + dr = r - r0[itype][jtype]; + dexp = exp(-alpha[itype][jtype] * dr); + r2inv = 1.0 / rsq; + r6inv = r2inv * r2inv * r2inv; + fforce = pedone1[itype][jtype] * (dexp * dexp - dexp) / r + + pedone2[itype][jtype] * r6inv * r6inv * r2inv; + fforce *= factor_lj; + + phi = d0[itype][jtype] * (dexp * dexp - 2.0 * dexp) + c0[itype][jtype] * r6inv * r6inv - + offset[itype][jtype]; + return factor_lj * phi; +} + +/* ---------------------------------------------------------------------- */ + +void *PairPedone::extract(const char *str, int &dim) +{ + dim = 2; + if (strcmp(str, "c0") == 0) return (void *) c0; + if (strcmp(str, "d0") == 0) return (void *) d0; + if (strcmp(str, "r0") == 0) return (void *) r0; + if (strcmp(str, "alpha") == 0) return (void *) alpha; + return nullptr; +} diff --git a/src/EXTRA-PAIR/pair_pedone.h b/src/EXTRA-PAIR/pair_pedone.h new file mode 100644 index 0000000000..a54f50a6b3 --- /dev/null +++ b/src/EXTRA-PAIR/pair_pedone.h @@ -0,0 +1,58 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(pedone,PairPedone); +// clang-format on +#else + +#ifndef LMP_PAIR_PEDONE_H +#define LMP_PAIR_PEDONE_H + +#include "pair.h" + +namespace LAMMPS_NS { + +class PairPedone : public Pair { + public: + PairPedone(class LAMMPS *); + ~PairPedone() override; + void compute(int, int) override; + + void settings(int, char **) override; + void coeff(int, char **) override; + double init_one(int, int) override; + void write_restart(FILE *) override; + void read_restart(FILE *) override; + void write_restart_settings(FILE *) override; + void read_restart_settings(FILE *) override; + void write_data(FILE *) override; + void write_data_all(FILE *) override; + double single(int, int, int, int, double, double, double, double &) override; + void *extract(const char *, int &) override; + + protected: + double cut_global; + double **cut; + double **d0, **alpha, **r0, **c0; + double **pedone1, **pedone2; + double **offset; + + virtual void allocate(); +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp index b1a066f165..6707deddd5 100644 --- a/src/KOKKOS/atom_kokkos.cpp +++ b/src/KOKKOS/atom_kokkos.cpp @@ -31,9 +31,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp), -mapBinner(1, 0.0, 1.0), // no default constructor, these values are not used -mapSorter(d_tag_sorted, 0, 1, mapBinner, true) +AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp) { avecKK = nullptr; diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h index 652e6c2191..669c8b510a 100644 --- a/src/KOKKOS/atom_kokkos.h +++ b/src/KOKKOS/atom_kokkos.h @@ -88,8 +88,25 @@ class AtomKokkos : public Atom { DAT::tdual_int_1d k_map_array; dual_hash_type k_map_hash; - DAT::t_tagint_1d d_tag_sorted; - DAT::t_int_1d d_i_sorted; + struct KeyValue { + int i; + bigint tag; + }; + + typedef Kokkos::View t_keyvalue_1d; + t_keyvalue_1d d_sorted; + + struct MyComp { + KOKKOS_FUNCTION + bool operator()(const KeyValue& a, const KeyValue& b) const + { + if (a.tag < b.tag) + return true; + if (b.tag < a.tag) + return false; + return a.i < b.i; + } + }; typedef Kokkos::DualView tdual_tagint_2; typedef tdual_tagint_2::t_dev t_tagint_2; @@ -101,11 +118,6 @@ class AtomKokkos : public Atom { DAT::t_tagint_scalar d_tag_min,d_tag_max; HAT::t_tagint_scalar h_tag_min,h_tag_max; - using MapKeyViewType = decltype(d_tag_sorted); - using BinOpMap = Kokkos::BinOp1D; - BinOpMap mapBinner; - Kokkos::BinSort mapSorter; - class AtomVecKokkos* avecKK; // map lookup function inlined for efficiency diff --git a/src/KOKKOS/atom_map_kokkos.cpp b/src/KOKKOS/atom_map_kokkos.cpp index 8203e1e6a2..2c4c58ee19 100644 --- a/src/KOKKOS/atom_map_kokkos.cpp +++ b/src/KOKKOS/atom_map_kokkos.cpp @@ -190,128 +190,64 @@ void AtomKokkos::map_set_device() atomKK->sync(Device, TAG_MASK); + int map_style_array = (map_style == MAP_ARRAY); + auto d_tag = atomKK->k_tag.d_view; auto d_sametag = k_sametag.d_view; - // sort by tag - int nmax = atom->nmax; - int realloc_flag = 0; - if (!d_tag_sorted.data() || (int)d_tag_sorted.extent(0) < nmax) { - MemKK::realloc_kokkos(d_tag_sorted,"atom:tag_sorted",nmax); - MemKK::realloc_kokkos(d_i_sorted,"atom:i_sorted",nmax); - realloc_flag = 1; - } + // sort by tag then local id - h_tag_min() = MAXTAGINT; - h_tag_max() = 0; + if (!d_sorted.data() || (int)d_sorted.extent(0) < nmax) + MemKK::realloc_kokkos(d_sorted,"atom:sorted",nmax); - Kokkos::deep_copy(d_tag_min_max,h_tag_min_max); - - auto l_tag_sorted = d_tag_sorted; - auto l_i_sorted = d_i_sorted; - auto l_tag_min = d_tag_min; - auto l_tag_max = d_tag_max; - int map_style_array = (map_style == MAP_ARRAY); + auto l_sorted = Kokkos::subview(d_sorted,std::make_pair(0,nall)); Kokkos::parallel_for(nall, LAMMPS_LAMBDA(int i) { - l_i_sorted(i) = i; - tagint tag_i = d_tag(i); - l_tag_sorted(i) = tag_i; - Kokkos::atomic_min(&l_tag_min(),tag_i); - Kokkos::atomic_max(&l_tag_max(),tag_i); + l_sorted(i).i = i; + l_sorted(i).tag = d_tag(i); }); - Kokkos::deep_copy(h_tag_min_max,d_tag_min_max); - - tagint min = h_tag_min(); - tagint max = h_tag_max(); - - using MapKeyViewType = decltype(d_tag_sorted); - using BinOpMap = Kokkos::BinOp1D; - - mapBinner = BinOpMap(nall, min, max); - - if (realloc_flag) { - mapSorter = Kokkos::BinSort(d_tag_sorted, 0, nall, mapBinner, true); - MemKK::realloc_kokkos(mapSorter.bin_count_atomic,"Kokkos::SortImpl::BinSortFunctor::bin_count",nmax+1); - Kokkos::deep_copy(mapSorter.bin_count_atomic,0); - mapSorter.bin_count_const = mapSorter.bin_count_atomic; - MemKK::realloc_kokkos(mapSorter.bin_offsets,"Kokkos::SortImpl::BinSortFunctor::bin_offsets",nmax+1); - MemKK::realloc_kokkos(mapSorter.sort_order,"Kokkos::SortImpl::BinSortFunctor::sort_order",nmax); - } else { - Kokkos::deep_copy(mapSorter.bin_count_atomic,0); - mapSorter.bin_op = mapBinner; - mapSorter.range_begin = 0; - mapSorter.range_end = nall; - } - - mapSorter.create_permute_vector(LMPDeviceType()); - mapSorter.sort(LMPDeviceType(), d_tag_sorted, 0, nall); - mapSorter.sort(LMPDeviceType(), d_i_sorted, 0, nall); + Kokkos::sort(LMPDeviceType(),l_sorted,MyComp{}); auto d_map_array = k_map_array.d_view; auto d_map_hash = k_map_hash.d_view; - d_map_hash.clear(); + if (!map_style_array) + d_map_hash.clear(); auto d_error_flag = k_error_flag.d_view; Kokkos::deep_copy(d_error_flag,0); - // for each tag find: - // neighboring atoms with closest local id for sametag // atom with smallest local id for atom map Kokkos::parallel_for(nall, LAMMPS_LAMBDA(int ii) { - const int i = l_i_sorted(ii); - const tagint tag_i = l_tag_sorted(ii); - int i_min = i; - int i_closest = MAXSMALLINT; + const int i = l_sorted(ii).i; + const tagint tag_i = l_sorted(ii).tag; - // search atoms with same tag in the forward direction + // sametag + tagint tag_j = -1; int jj = ii+1; - int closest_flag = 0; + if (jj < nall) tag_j = l_sorted(jj).tag; - while (jj < nall) { - const tagint tag_j = l_tag_sorted(jj); - if (tag_j != tag_i) break; - const int j = l_i_sorted(jj); - i_min = MIN(i_min,j); - if (j > i) { - i_closest = MIN(i_closest,j); - closest_flag = 1; - } - jj++; - } + if (tag_j == tag_i) + d_sametag(i) = l_sorted(jj).i; + else + d_sametag(i) = -1; - // search atoms with same tag in the reverse direction + // atom map + tag_j = -1; jj = ii-1; + if (jj >= 0) tag_j = l_sorted(jj).tag; - while (jj >= 0) { - const tagint tag_j = l_tag_sorted(jj); - if (tag_j != tag_i) break; - const int j = l_i_sorted(jj); - i_min = MIN(i_min,j); - if (j > i) { - i_closest = MIN(i_closest,j); - closest_flag = 1; - } - jj--; - } - - if (!closest_flag) - i_closest = -1; - - d_sametag(i) = i_closest; - - if (i == i_min) { + if (tag_j != tag_i) { if (map_style_array) - d_map_array(tag_i) = i_min; + d_map_array(tag_i) = i; else { - auto insert_result = d_map_hash.insert(tag_i, i_min); + auto insert_result = d_map_hash.insert(tag_i, i); if (insert_result.failed()) d_error_flag() = 1; } } diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 2425857d61..8f821c3036 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -880,15 +880,8 @@ void CommKokkos::exchange_device() // sort exchange_sendlist - auto d_exchange_sendlist = k_exchange_sendlist.view(); - using KeyViewType = decltype(d_exchange_sendlist); - using BinOp = Kokkos::BinOp1D; - - BinOp binner(count, 0, nlocal); - Kokkos::BinSort Sorter(d_exchange_sendlist, 0, count, binner, true); - Sorter.create_permute_vector(DeviceType()); - Sorter.sort(DeviceType(), d_exchange_sendlist, 0, count); - + auto d_exchange_sendlist = Kokkos::subview(k_exchange_sendlist.view(),std::make_pair(0,count)); + Kokkos::sort(DeviceType(), d_exchange_sendlist); k_exchange_sendlist.sync(); // when atom is deleted, fill it in with last atom diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp index aecc12cd12..d0865c6afb 100644 --- a/src/KOKKOS/domain_kokkos.cpp +++ b/src/KOKKOS/domain_kokkos.cpp @@ -22,7 +22,7 @@ using namespace LAMMPS_NS; -static constexpr double BIG = 1.0e20; +static constexpr double BIG = 1.0e20; /* ---------------------------------------------------------------------- */ @@ -80,6 +80,11 @@ public: void DomainKokkos::reset_box() { // perform shrink-wrapping + + // nothing to do for empty systems + + if (atom->natoms == 0) return; + // compute extent of atoms on this proc // for triclinic, this is done in lamda space diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 5caed42f43..202d46e788 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -39,8 +39,11 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; + +#if defined(LMP_KOKKOS_GPU) int ngpus = lmp->kokkos->ngpus; ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; +#endif #if defined(FFT_KOKKOS_MKL) if (ngpus > 0 && execution_space == Device) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index 15417d7620..c4bd603041 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -950,10 +950,10 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P static int vectorsize = 0; static int atoms_per_team = 0; - static int teamsize_max_for = 0; - static int teamsize_max_reduce = 0; #if defined(LMP_KOKKOS_GPU) + static int teamsize_max_for = 0; + static int teamsize_max_reduce = 0; static int lastcall = -1; if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) { lastcall = fpair->lmp->update->ntimestep; diff --git a/src/MDI/mdi_engine.cpp b/src/MDI/mdi_engine.cpp index 6d7c604fa6..4f69e814f5 100644 --- a/src/MDI/mdi_engine.cpp +++ b/src/MDI/mdi_engine.cpp @@ -695,7 +695,7 @@ void MDIEngine::mdi_md() if (strcmp(mdicmd, "EXIT") == 0) return; // run one step at a time forever - // driver triggers exit with @ command other than @COORDS,@FORCES,@ENDSTEP + // driver triggers exit with @ command other than @COORDS,@FORCES,@ENDSTEP,@ update->integrate->setup(1); @@ -711,7 +711,7 @@ void MDIEngine::mdi_md() update->integrate->run(1); if (strcmp(mdicmd, "@COORDS") != 0 && strcmp(mdicmd, "@FORCES") != 0 && - strcmp(mdicmd, "@ENDSTEP") != 0) + strcmp(mdicmd, "@ENDSTEP") != 0 && strcmp(mdicmd, "@") != 0) break; } diff --git a/src/MOLECULE/fix_cmap.cpp b/src/MOLECULE/fix_cmap.cpp index cb4cb8cadc..bc220da30e 100644 --- a/src/MOLECULE/fix_cmap.cpp +++ b/src/MOLECULE/fix_cmap.cpp @@ -970,7 +970,8 @@ void FixCMAP::read_data_section(char * /*keyword*/, int /*n*/, char *buf, atom5 = values.next_tagint(); if (values.has_next()) throw TokenizerException("too many items",line); } catch (std::exception &e) { - error->all(FLERR,"Incorrect format of CMAP section: {}", e.what()); + error->all(FLERR,"Incorrect format of CMAP section in data file: {}{}", + e.what(), utils::errorurl(2)); } atom1 += id_offset; diff --git a/src/OPENMP/angle_cosine_squared_omp.cpp b/src/OPENMP/angle_cosine_squared_omp.cpp index 9b849c62a3..f70e167d36 100644 --- a/src/OPENMP/angle_cosine_squared_omp.cpp +++ b/src/OPENMP/angle_cosine_squared_omp.cpp @@ -16,14 +16,16 @@ Contributing author: Axel Kohlmeyer (Temple U) ------------------------------------------------------------------------- */ -#include "omp_compat.h" #include "angle_cosine_squared_omp.h" -#include + #include "atom.h" #include "comm.h" #include "force.h" #include "neighbor.h" +#include + +#include "omp_compat.h" #include "suffix.h" using namespace LAMMPS_NS; diff --git a/src/OPENMP/pair_pedone_omp.cpp b/src/OPENMP/pair_pedone_omp.cpp new file mode 100644 index 0000000000..140816bfe4 --- /dev/null +++ b/src/OPENMP/pair_pedone_omp.cpp @@ -0,0 +1,169 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +#include "pair_pedone_omp.h" + +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neigh_list.h" +#include "suffix.h" + +#include + +#include "omp_compat.h" +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairPedoneOMP::PairPedoneOMP(LAMMPS *lmp) : PairPedone(lmp), ThrOMP(lmp, THR_PAIR) +{ + suffix_flag |= Suffix::OMP; + respa_enable = 0; +} + +/* ---------------------------------------------------------------------- */ + +void PairPedoneOMP::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + const int nall = atom->nlocal + atom->nghost; + const int nthreads = comm->nthreads; + const int inum = list->inum; + +#if defined(_OPENMP) +#pragma omp parallel LMP_DEFAULT_NONE LMP_SHARED(eflag, vflag) +#endif + { + int ifrom, ito, tid; + + loop_setup_thr(ifrom, ito, tid, inum, nthreads); + ThrData *thr = fix->get_thr(tid); + thr->timer(Timer::START); + ev_setup_thr(eflag, vflag, nall, eatom, vatom, nullptr, thr); + + if (evflag) { + if (eflag) { + if (force->newton_pair) + eval<1, 1, 1>(ifrom, ito, thr); + else + eval<1, 1, 0>(ifrom, ito, thr); + } else { + if (force->newton_pair) + eval<1, 0, 1>(ifrom, ito, thr); + else + eval<1, 0, 0>(ifrom, ito, thr); + } + } else { + if (force->newton_pair) + eval<0, 0, 1>(ifrom, ito, thr); + else + eval<0, 0, 0>(ifrom, ito, thr); + } + + thr->timer(Timer::PAIR); + reduce_thr(this, eflag, vflag, thr); + } // end of omp parallel region +} + +template +void PairPedoneOMP::eval(int iifrom, int iito, ThrData *const thr) +{ + int i, j, ii, jj, jnum, itype, jtype; + double xtmp, ytmp, ztmp, delx, dely, delz, evdwl, fpair; + double rsq, r, r2inv, r6inv, dr, dexp, factor_lj; + int *ilist, *jlist, *numneigh, **firstneigh; + + evdwl = 0.0; + + const auto *_noalias const x = (dbl3_t *) atom->x[0]; + auto *_noalias const f = (dbl3_t *) thr->get_f()[0]; + const int *_noalias const type = atom->type; + const int nlocal = atom->nlocal; + const double *_noalias const special_lj = force->special_lj; + double fxtmp, fytmp, fztmp; + + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = iifrom; ii < iito; ++ii) { + + i = ilist[ii]; + xtmp = x[i].x; + ytmp = x[i].y; + ztmp = x[i].z; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + fxtmp = fytmp = fztmp = 0.0; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j].x; + dely = ytmp - x[j].y; + delz = ztmp - x[j].z; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0 / rsq; + r6inv = r2inv * r2inv * r2inv; + r = sqrt(rsq); + dr = r - r0[itype][jtype]; + dexp = exp(-alpha[itype][jtype] * dr); + fpair = pedone1[itype][jtype] * (dexp * dexp - dexp) / r + + pedone2[itype][jtype] * r6inv * r6inv * r2inv; + fpair *= factor_lj; + + fxtmp += delx * fpair; + fytmp += dely * fpair; + fztmp += delz * fpair; + if (NEWTON_PAIR || j < nlocal) { + f[j].x -= delx * fpair; + f[j].y -= dely * fpair; + f[j].z -= delz * fpair; + } + + if (EFLAG) { + evdwl = d0[itype][jtype] * (dexp * dexp - 2.0 * dexp) + c0[itype][jtype] * r6inv * r6inv - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (EVFLAG) + ev_tally_thr(this, i, j, nlocal, NEWTON_PAIR, evdwl, 0.0, fpair, delx, dely, delz, thr); + } + } + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairPedoneOMP::memory_usage() +{ + double bytes = memory_usage_thr(); + bytes += PairPedone::memory_usage(); + + return bytes; +} diff --git a/src/OPENMP/pair_pedone_omp.h b/src/OPENMP/pair_pedone_omp.h new file mode 100644 index 0000000000..8c23e86fcd --- /dev/null +++ b/src/OPENMP/pair_pedone_omp.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(pedone/omp,PairPedoneOMP); +// clang-format on +#else + +#ifndef LMP_PAIR_PEDONE_OMP_H +#define LMP_PAIR_PEDONE_OMP_H + +#include "pair_pedone.h" +#include "thr_omp.h" + +namespace LAMMPS_NS { + +class PairPedoneOMP : public PairPedone, public ThrOMP { + + public: + PairPedoneOMP(class LAMMPS *); + + void compute(int, int) override; + double memory_usage() override; + + private: + template + void eval(int ifrom, int ito, ThrData *const thr); +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/RIGID/fix_rigid.cpp b/src/RIGID/fix_rigid.cpp index 7a63c52220..f4b82e1fd2 100644 --- a/src/RIGID/fix_rigid.cpp +++ b/src/RIGID/fix_rigid.cpp @@ -1234,11 +1234,6 @@ void FixRigid::enforce2d() angmom[ibody][1] = 0.0; omega[ibody][0] = 0.0; omega[ibody][1] = 0.0; - if (langflag && langextra) { - langextra[ibody][2] = 0.0; - langextra[ibody][3] = 0.0; - langextra[ibody][4] = 0.0; - } } } @@ -1958,6 +1953,8 @@ void FixRigid::setup_bodies_static() // diagonalize inertia tensor for each body via Jacobi rotations // inertia = 3 eigenvalues = principal moments of inertia + // request that jacobi3() return them in ascending order, + /// so that in 2d last evector is z-axis // evectors and exzy_space = 3 evectors = principal axes of rigid body int ierror; @@ -1972,7 +1969,7 @@ void FixRigid::setup_bodies_static() tensor[0][2] = tensor[2][0] = all[ibody][4]; tensor[0][1] = tensor[1][0] = all[ibody][5]; - ierror = MathEigen::jacobi3(tensor,inertia[ibody],evectors); + ierror = MathEigen::jacobi3(tensor,inertia[ibody],evectors,1); if (ierror) error->all(FLERR, "Insufficient Jacobi rotations for rigid body"); @@ -1986,6 +1983,22 @@ void FixRigid::setup_bodies_static() ez_space[ibody][1] = evectors[1][2]; ez_space[ibody][2] = evectors[2][2]; + // for 2d, ensure that evector along z axis is last + // necessary so that quaternion is a simple rotation around +z axis + // or a 180 degree rotation for a -z axis + // otherwise richardson() method for a body with a tiny evalue (near-linear) + // may not preserve the correct z-aligned quat and associated evectors + // over time due to round-off accumulation + + if (domain->dimension == 2) { + if (fabs(ez_space[ibody][0]) > EPSILON || fabs(ez_space[ibody][1]) > EPSILON) { + std::swap(inertia[ibody][1],inertia[ibody][2]); + std::swap(ey_space[ibody][0],ez_space[ibody][0]); + std::swap(ey_space[ibody][1],ez_space[ibody][1]); + std::swap(ey_space[ibody][2],ez_space[ibody][2]); + } + } + // if any principal moment < scaled EPSILON, set to 0.0 double max; diff --git a/src/RIGID/fix_rigid_small.cpp b/src/RIGID/fix_rigid_small.cpp index 9e185a4de2..0bfd8032c5 100644 --- a/src/RIGID/fix_rigid_small.cpp +++ b/src/RIGID/fix_rigid_small.cpp @@ -1110,11 +1110,6 @@ void FixRigidSmall::enforce2d() b->angmom[1] = 0.0; b->omega[0] = 0.0; b->omega[1] = 0.0; - if (langflag && langextra) { - langextra[ibody][2] = 0.0; - langextra[ibody][3] = 0.0; - langextra[ibody][4] = 0.0; - } } } @@ -2102,6 +2097,8 @@ void FixRigidSmall::setup_bodies_static() // diagonalize inertia tensor for each body via Jacobi rotations // inertia = 3 eigenvalues = principal moments of inertia + // request that jacobi3() returns them in ascending order, + // so that in 2d last evector is z-axis // evectors and exzy_space = 3 evectors = principal axes of rigid body int ierror; @@ -2118,7 +2115,7 @@ void FixRigidSmall::setup_bodies_static() tensor[0][1] = tensor[1][0] = itensor[ibody][5]; inertia = body[ibody].inertia; - ierror = MathEigen::jacobi3(tensor,inertia,evectors); + ierror = MathEigen::jacobi3(tensor,inertia,evectors,1); if (ierror) error->all(FLERR, "Insufficient Jacobi rotations for rigid body"); ex = body[ibody].ex_space; @@ -2134,6 +2131,22 @@ void FixRigidSmall::setup_bodies_static() ez[1] = evectors[1][2]; ez[2] = evectors[2][2]; + // for 2d, ensure that evector along z axis is last + // necessary so that quaternion is a simple rotation around +z axis + // or a 180 degree rotation for a -z axis + // otherwise richardson() method for a body with a tiny evalue (near-linear) + // may not preserve the correct z-aligned quat and associated evectors + // over time due to round-off accumulation + + if (domain->dimension == 2) { + if (fabs(ez[0]) > EPSILON || fabs(ez[1]) > EPSILON) { + std::swap(inertia[1],inertia[2]); + std::swap(ey[0],ez[0]); + std::swap(ey[1],ez[1]); + std::swap(ey[2],ez[2]); + } + } + // if any principal moment < scaled EPSILON, set to 0.0 double max; @@ -2156,11 +2169,12 @@ void FixRigidSmall::setup_bodies_static() // convert geometric center position to principal axis coordinates // xcm is wrapped, but xgc is not initially + xcm = body[ibody].xcm; xgc = body[ibody].xgc; double delta[3]; MathExtra::sub3(xgc,xcm,delta); - domain->minimum_image(delta); + domain->minimum_image_big(delta); MathExtra::transpose_matvec(ex,ey,ez,delta,body[ibody].xgc_body); MathExtra::add3(xcm,delta,xgc); } diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp index 1c92d284f0..fc5e223e75 100644 --- a/src/SPIN/compute_spin.cpp +++ b/src/SPIN/compute_spin.cpp @@ -215,9 +215,8 @@ void ComputeSpin::compute_vector() tempnum += tx*tx+ty*ty+tz*tz; tempdenom += sp[i][0]*fm[i][0]+fm[i][1]*sp[i][1]+sp[i][2]*fm[i][2]; countsp++; - } + } else error->all(FLERR,"Compute compute/spin requires atom/spin style"); } - else error->all(FLERR,"Compute compute/spin requires atom/spin style"); } MPI_Allreduce(mag,magtot,4,MPI_DOUBLE,MPI_SUM,world); diff --git a/src/atom.cpp b/src/atom.cpp index 085ca88b4e..7edf4760d2 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -1068,7 +1068,8 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset, } if ((nwords != avec->size_data_atom) && (nwords != avec->size_data_atom + 3)) - error->all(FLERR,"Incorrect format in {}: {}", location, utils::trim(buf)); + error->all(FLERR,"Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); *next = '\n'; // set bounds for my proc @@ -1149,7 +1150,8 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset, // skip over empty or comment lines } else if ((nvalues < nwords) || ((nvalues > nwords) && (!utils::strmatch(values[nwords],"^#")))) { - error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); + error->all(FLERR, "Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); } else { int imx = 0, imy = 0, imz = 0; if (imageflag) { @@ -1243,7 +1245,8 @@ void Atom::data_vels(int n, char *buf, tagint id_offset) if (values.size() == 0) { // skip over empty or comment lines } else if ((int)values.size() != avec->size_data_vel) { - error->all(FLERR, "Incorrect velocity format in data file: {}", utils::trim(buf)); + error->all(FLERR, "Incorrect format in Velocities section of data file: {}{}", + utils::trim(buf), utils::errorurl(2)); } else { tagint tagdata = utils::tnumeric(FLERR,values[0],false,lmp) + id_offset; if (tagdata <= 0 || tagdata > map_tag_max) @@ -1287,7 +1290,9 @@ void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset, // skip over empty or comment lines // Bonds line is: number(ignored), bond type, atomID 1, atomID 2 if (nwords > 0) { - if (nwords != 4) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); + if (nwords != 4) + error->all(FLERR, "Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); typestr = utils::utf8_subst(values[1]); atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp); @@ -1378,7 +1383,9 @@ void Atom::data_angles(int n, char *buf, int *count, tagint id_offset, // skip over empty or comment lines // Angles line is: number(ignored), angle type, atomID 1, atomID 2, atomID 3 if (nwords > 0) { - if (nwords != 5) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); + if (nwords != 5) + error->all(FLERR, "Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); typestr = utils::utf8_subst(values[1]); atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp); @@ -1485,7 +1492,9 @@ void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset, // skip over empty or comment lines // Dihedrals line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4 if (nwords > 0) { - if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); + if (nwords != 6) + error->all(FLERR, "Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); typestr = utils::utf8_subst(values[1]); atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp); @@ -1611,7 +1620,9 @@ void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset, // skip over empty or comment lines // Impropers line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4 if (nwords > 0) { - if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); + if (nwords != 6) + error->all(FLERR, "Incorrect format in {}: {}{}", location, + utils::trim(buf), utils::errorurl(2)); typestr = utils::utf8_subst(values[1]); atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp); @@ -1727,7 +1738,8 @@ void Atom::data_bonus(int n, char *buf, AtomVec *avec_bonus, tagint id_offset) if (values.size() == 0) { // skip over empty or comment lines } else if ((int)values.size() != avec_bonus->size_data_bonus) { - error->all(FLERR, "Incorrect bonus data format in data file: {}", utils::trim(buf)); + error->all(FLERR, "Incorrect format in Bonus section of data file: {}{}", + utils::trim(buf), utils::errorurl(2)); } else { tagint tagdata = utils::tnumeric(FLERR,values[0],false,lmp) + id_offset; if (tagdata <= 0 || tagdata > map_tag_max) diff --git a/src/domain.cpp b/src/domain.cpp index 7513d384e2..c825016b27 100644 --- a/src/domain.cpp +++ b/src/domain.cpp @@ -973,6 +973,9 @@ void Domain::subbox_too_small_check(double thresh) changed "if" to "while" to enable distance to far-away ghost atom returned by atom->map() to be wrapped back into box could be problem for looking up atom IDs when cutoff > boxsize + should be used for most cases where the difference in the image count + is small (usually 0 or 1) + use minimum_image_big() when a large difference between image counts is expected ------------------------------------------------------------------------- */ static constexpr double MAXIMGCOUNT = 16; @@ -1045,6 +1048,72 @@ void Domain::minimum_image(double &dx, double &dy, double &dz) const } } +/* ---------------------------------------------------------------------- + minimum image convention in periodic dimensions + use 1/2 of box size as test + for triclinic, also add/subtract tilt factors in other dims as needed + allow multiple box lengths to enable distance to + far-away ghost atom returned by atom->map() to be wrapped back into box + could be problem for looking up atom IDs when cutoff > boxsize + this should be used when there is a large image count difference possible + this applies for example to fix rigid/small +------------------------------------------------------------------------- */ + +void Domain::minimum_image_big(double &dx, double &dy, double &dz) const +{ + if (triclinic == 0) { + if (xperiodic) { + double dfactor = dx/xprd + 0.5; + if (dx < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dx); + dx -= xprd * static_cast(dfactor); + } + if (yperiodic) { + double dfactor = dy/yprd + 0.5; + if (dy < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dy); + dy -= yprd * static_cast(dfactor); + } + if (zperiodic) { + double dfactor = dz/zprd + 0.5; + if (dz < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dz); + dz -= zprd * static_cast(dfactor); + } + + } else { + if (zperiodic) { + double dfactor = dz/zprd + 0.5; + if (dz < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dz); + int factor = static_cast(dfactor); + dz -= zprd * factor; + dy -= yz * factor; + dx -= xz * factor; + } + if (yperiodic) { + double dfactor = dy/yprd + 0.5; + if (dy < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dy); + int factor = static_cast(dfactor); + dy -= yprd * factor; + dx -= xy * factor; + } + if (xperiodic) { + double dfactor = dx/xprd + 0.5; + if (dx < 0) dfactor -= 1.0; + if (dfactor > MAXSMALLINT) + error->one(FLERR, "Atoms have moved too far apart ({}) for minimum image\n", dx); + dx -= xprd * static_cast(dfactor); + } + } +} + /* ---------------------------------------------------------------------- return local index of atom J or any of its images that is closest to atom I if J is not a valid index like -1, just return it diff --git a/src/domain.h b/src/domain.h index ab054f1b50..ac4dd12e61 100644 --- a/src/domain.h +++ b/src/domain.h @@ -119,6 +119,8 @@ class Domain : protected Pointers { void subbox_too_small_check(double); void minimum_image(double &, double &, double &) const; void minimum_image(double *delta) const { minimum_image(delta[0], delta[1], delta[2]); } + void minimum_image_big(double &, double &, double &) const; + void minimum_image_big(double *delta) const { minimum_image_big(delta[0], delta[1], delta[2]); } int closest_image(int, int); int closest_image(const double *const, int); void closest_image(const double *const, const double *const, double *const); diff --git a/src/fix_ave_correlate.cpp b/src/fix_ave_correlate.cpp index d506e17761..08cd673122 100644 --- a/src/fix_ave_correlate.cpp +++ b/src/fix_ave_correlate.cpp @@ -97,7 +97,7 @@ FixAveCorrelate::FixAveCorrelate(LAMMPS *lmp, int narg, char **arg) : while (iarg < nargnew) { if (strcmp(arg[iarg],"type") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate type", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate type", error); if (strcmp(arg[iarg+1],"auto") == 0) type = AUTO; else if (strcmp(arg[iarg+1],"upper") == 0) type = UPPER; else if (strcmp(arg[iarg+1],"lower") == 0) type = LOWER; @@ -107,21 +107,21 @@ FixAveCorrelate::FixAveCorrelate(LAMMPS *lmp, int narg, char **arg) : else error->all(FLERR,"Unknown fix ave/correlate type: {}"); iarg += 2; } else if (strcmp(arg[iarg],"ave") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate ave", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate ave", error); if (strcmp(arg[iarg+1],"one") == 0) ave = ONE; else if (strcmp(arg[iarg+1],"running") == 0) ave = RUNNING; else error->all(FLERR,"Unknown fix ave/correlate ave mode: {}", arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"start") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate start", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate start", error); startstep = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"prefactor") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate prefactor", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate prefactor", error); prefactor = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"file") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate file", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate file", error); if (comm->me == 0) { fp = fopen(arg[iarg+1],"w"); if (fp == nullptr) @@ -133,17 +133,17 @@ FixAveCorrelate::FixAveCorrelate(LAMMPS *lmp, int narg, char **arg) : overwrite = 1; iarg += 1; } else if (strcmp(arg[iarg],"title1") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate title1", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate title1", error); delete[] title1; title1 = utils::strdup(arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"title2") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate title2", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate title2", error); delete[] title2; title2 = utils::strdup(arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"title3") == 0) { - if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/correlate title3", error); + if (iarg+2 > nargnew) utils::missing_cmd_args(FLERR, "fix ave/correlate title3", error); delete[] title3; title3 = utils::strdup(arg[iarg+1]); iarg += 2; @@ -414,6 +414,8 @@ void FixAveCorrelate::end_of_step() scalar = val.val.f->compute_vector(val.argindex-1); // evaluate equal-style or vector-style variable + // if index exceeds vector length, use a zero value + // this can be useful if vector length is not known a priori } else if (val.which == ArgInfo::VARIABLE) { if (val.argindex == 0) @@ -422,7 +424,7 @@ void FixAveCorrelate::end_of_step() double *varvec; int nvec = input->variable->compute_vector(val.val.v,&varvec); int index = val.argindex; - if (nvec < index) scalar = 0.0; + if (index > nvec) scalar = 0.0; else scalar = varvec[index-1]; } } diff --git a/src/fix_ave_histo.cpp b/src/fix_ave_histo.cpp index b3ca9e1106..d676bca0b7 100644 --- a/src/fix_ave_histo.cpp +++ b/src/fix_ave_histo.cpp @@ -600,6 +600,8 @@ void FixAveHisto::end_of_step() } // evaluate equal-style or vector-style or atom-style variable + // if index exceeds vector length, use a zero value + // this can be useful if vector length is not known a priori } else if (val.which == ArgInfo::VARIABLE) { if (kind == GLOBAL && mode == SCALAR) { @@ -607,7 +609,7 @@ void FixAveHisto::end_of_step() else { double *varvec; int nvec = input->variable->compute_vector(val.val.v,&varvec); - if (nvec < j) bin_one(0.0); + if (j > nvec) bin_one(0.0); else bin_one(varvec[j-1]); } diff --git a/src/fix_ave_histo_weight.cpp b/src/fix_ave_histo_weight.cpp index 7a5458bd3d..cccfc86d59 100644 --- a/src/fix_ave_histo_weight.cpp +++ b/src/fix_ave_histo_weight.cpp @@ -113,256 +113,256 @@ void FixAveHistoWeight::end_of_step() double weight = 0.0; double *weights = nullptr; int stride = 0; - auto &val = values[1]; - int j = val.argindex; + auto &val1 = values[1]; + int j = val1.argindex; // atom attributes - if (val.which == ArgInfo::X) { + if (val1.which == ArgInfo::X) { weights = &atom->x[0][j]; stride = 3; - } else if (val.which == ArgInfo::V) { + } else if (val1.which == ArgInfo::V) { weights = &atom->v[0][j]; stride = 3; bin_atoms(&atom->v[0][j],3); - } else if (val.which == ArgInfo::F) { + } else if (val1.which == ArgInfo::F) { weights = &atom->f[0][j]; stride = 3; } // invoke compute if not previously invoked - if (val.which == ArgInfo::COMPUTE) { + if (val1.which == ArgInfo::COMPUTE) { if (kind == GLOBAL && mode == SCALAR) { if (j == 0) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_SCALAR)) { - val.val.c->compute_scalar(); - val.val.c->invoked_flag |= Compute::INVOKED_SCALAR; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_SCALAR)) { + val1.val.c->compute_scalar(); + val1.val.c->invoked_flag |= Compute::INVOKED_SCALAR; } - weight = val.val.c->scalar; + weight = val1.val.c->scalar; } else { - if (!(val.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { - val.val.c->compute_vector(); - val.val.c->invoked_flag |= Compute::INVOKED_VECTOR; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { + val1.val.c->compute_vector(); + val1.val.c->invoked_flag |= Compute::INVOKED_VECTOR; } - weight = val.val.c->vector[j-1]; + weight = val1.val.c->vector[j-1]; } } else if (kind == GLOBAL && mode == VECTOR) { if (j == 0) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { - val.val.c->compute_vector(); - val.val.c->invoked_flag |= Compute::INVOKED_VECTOR; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { + val1.val.c->compute_vector(); + val1.val.c->invoked_flag |= Compute::INVOKED_VECTOR; } - weights = val.val.c->vector; + weights = val1.val.c->vector; stride = 1; } else { - if (!(val.val.c->invoked_flag & Compute::INVOKED_ARRAY)) { - val.val.c->compute_array(); - val.val.c->invoked_flag |= Compute::INVOKED_ARRAY; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_ARRAY)) { + val1.val.c->compute_array(); + val1.val.c->invoked_flag |= Compute::INVOKED_ARRAY; } - if (val.val.c->array) weights = &val.val.c->array[0][j-1]; - stride = val.val.c->size_array_cols; + if (val1.val.c->array) weights = &val1.val.c->array[0][j-1]; + stride = val1.val.c->size_array_cols; } } else if (kind == PERATOM) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_PERATOM)) { - val.val.c->compute_peratom(); - val.val.c->invoked_flag |= Compute::INVOKED_PERATOM; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_PERATOM)) { + val1.val.c->compute_peratom(); + val1.val.c->invoked_flag |= Compute::INVOKED_PERATOM; } if (j == 0) { - weights = val.val.c->vector_atom; + weights = val1.val.c->vector_atom; stride = 1; - } else if (val.val.c->array_atom) { - weights = &val.val.c->array_atom[0][j-1]; - stride = val.val.c->size_peratom_cols; + } else if (val1.val.c->array_atom) { + weights = &val1.val.c->array_atom[0][j-1]; + stride = val1.val.c->size_peratom_cols; } } else if (kind == LOCAL) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_LOCAL)) { - val.val.c->compute_local(); - val.val.c->invoked_flag |= Compute::INVOKED_LOCAL; + if (!(val1.val.c->invoked_flag & Compute::INVOKED_LOCAL)) { + val1.val.c->compute_local(); + val1.val.c->invoked_flag |= Compute::INVOKED_LOCAL; } if (j == 0) { - weights = val.val.c->vector_local; + weights = val1.val.c->vector_local; stride = 1; - } else if (val.val.c->array_local) { - weights = &val.val.c->array_local[0][j-1]; - stride = val.val.c->size_local_cols; + } else if (val1.val.c->array_local) { + weights = &val1.val.c->array_local[0][j-1]; + stride = val1.val.c->size_local_cols; } } // access fix fields, guaranteed to be ready - } else if (val.which == ArgInfo::FIX) { + } else if (val1.which == ArgInfo::FIX) { if (kind == GLOBAL && mode == SCALAR) { - if (j == 0) weight = val.val.f->compute_scalar(); - else weight = val.val.f->compute_vector(j-1); + if (j == 0) weight = val1.val.f->compute_scalar(); + else weight = val1.val.f->compute_vector(j-1); } else if (kind == GLOBAL && mode == VECTOR) { error->all(FLERR,"Fix ave/histo/weight option not yet supported"); // NOTE: need to allocate local storage if (j == 0) { - int n = val.val.f->size_vector; - for (int i = 0; i < n; i++) weights[n] = val.val.f->compute_vector(i); + int n = val1.val.f->size_vector; + for (int i = 0; i < n; i++) weights[n] = val1.val.f->compute_vector(i); } else { - int n = val.val.f->size_vector; - for (int i = 0; i < n; i++) weights[n] = val.val.f->compute_array(i,j-1); + int n = val1.val.f->size_vector; + for (int i = 0; i < n; i++) weights[n] = val1.val.f->compute_array(i,j-1); } } else if (kind == PERATOM) { if (j == 0) { - weights = val.val.f->vector_atom; + weights = val1.val.f->vector_atom; stride = 1; - } else if (val.val.f->array_atom) { - weights = &val.val.f->array_atom[0][j-1]; - stride = val.val.f->size_peratom_cols; + } else if (val1.val.f->array_atom) { + weights = &val1.val.f->array_atom[0][j-1]; + stride = val1.val.f->size_peratom_cols; } } else if (kind == LOCAL) { if (j == 0) { - weights = val.val.f->vector_local; + weights = val1.val.f->vector_local; stride = 1; - } else if (val.val.f->array_local) { - weights = &val.val.f->array_local[0][j-1]; - stride = val.val.f->size_local_cols; + } else if (val1.val.f->array_local) { + weights = &val1.val.f->array_local[0][j-1]; + stride = val1.val.f->size_local_cols; } } // evaluate equal-style variable - } else if (val.which == ArgInfo::VARIABLE && kind == GLOBAL) { - weight = input->variable->compute_equal(val.val.v); + } else if (val1.which == ArgInfo::VARIABLE && kind == GLOBAL) { + weight = input->variable->compute_equal(val1.val.v); - } else if (val.which == ArgInfo::VARIABLE && kind == PERATOM) { + } else if (val1.which == ArgInfo::VARIABLE && kind == PERATOM) { if (atom->nmax > maxatom) { memory->destroy(vector); maxatom = atom->nmax; memory->create(vector,maxatom,"ave/histo/weight:vector"); } - input->variable->compute_atom(val.val.v,igroup,vector,1,0); + input->variable->compute_atom(val1.val.v,igroup,vector,1,0); weights = vector; stride = 1; } // bin values using weights, values are 1st value (i = 0) - val = values[0]; - j = val.argindex; + auto &val0 = values[0]; + j = val0.argindex; // atom attributes - if (val.which == ArgInfo::X && weights != nullptr) + if (val0.which == ArgInfo::X && weights != nullptr) bin_atoms_weights(&atom->x[0][j],3,weights,stride); - else if (val.which == ArgInfo::V && weights != nullptr) + else if (val0.which == ArgInfo::V && weights != nullptr) bin_atoms_weights(&atom->v[0][j],3,weights,stride); - else if (val.which == ArgInfo::F && weights != nullptr) + else if (val0.which == ArgInfo::F && weights != nullptr) bin_atoms_weights(&atom->f[0][j],3,weights,stride); // invoke compute if not previously invoked - if (val.which == ArgInfo::COMPUTE) { + if (val0.which == ArgInfo::COMPUTE) { if (kind == GLOBAL && mode == SCALAR) { if (j == 0) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_SCALAR)) { - val.val.c->compute_scalar(); - val.val.c->invoked_flag |= Compute::INVOKED_SCALAR; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_SCALAR)) { + val0.val.c->compute_scalar(); + val0.val.c->invoked_flag |= Compute::INVOKED_SCALAR; } - bin_one_weights(val.val.c->scalar,weight); + bin_one_weights(val0.val.c->scalar,weight); } else { - if (!(val.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { - val.val.c->compute_vector(); - val.val.c->invoked_flag |= Compute::INVOKED_VECTOR; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { + val0.val.c->compute_vector(); + val0.val.c->invoked_flag |= Compute::INVOKED_VECTOR; } - bin_one_weights(val.val.c->vector[j-1],weight); + bin_one_weights(val0.val.c->vector[j-1],weight); } } else if (kind == GLOBAL && mode == VECTOR) { if (j == 0) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { - val.val.c->compute_vector(); - val.val.c->invoked_flag |= Compute::INVOKED_VECTOR; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_VECTOR)) { + val0.val.c->compute_vector(); + val0.val.c->invoked_flag |= Compute::INVOKED_VECTOR; } - bin_vector_weights(val.val.c->size_vector,val.val.c->vector,1, + bin_vector_weights(val0.val.c->size_vector,val0.val.c->vector,1, weights,stride); } else { - if (!(val.val.c->invoked_flag & Compute::INVOKED_ARRAY)) { - val.val.c->compute_array(); - val.val.c->invoked_flag |= Compute::INVOKED_ARRAY; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_ARRAY)) { + val0.val.c->compute_array(); + val0.val.c->invoked_flag |= Compute::INVOKED_ARRAY; } - if (val.val.c->array) - bin_vector_weights(val.val.c->size_array_rows,&val.val.c->array[0][j-1], - val.val.c->size_array_cols,weights,stride); + if (val0.val.c->array) + bin_vector_weights(val0.val.c->size_array_rows,&val0.val.c->array[0][j-1], + val0.val.c->size_array_cols,weights,stride); } } else if (kind == PERATOM) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_PERATOM)) { - val.val.c->compute_peratom(); - val.val.c->invoked_flag |= Compute::INVOKED_PERATOM; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_PERATOM)) { + val0.val.c->compute_peratom(); + val0.val.c->invoked_flag |= Compute::INVOKED_PERATOM; } if (j == 0) - bin_atoms_weights(val.val.c->vector_atom,1,weights, stride); - else if (val.val.c->array_atom) - bin_atoms_weights(&val.val.c->array_atom[0][j-1], - val.val.c->size_peratom_cols,weights,stride); + bin_atoms_weights(val0.val.c->vector_atom,1,weights, stride); + else if (val0.val.c->array_atom) + bin_atoms_weights(&val0.val.c->array_atom[0][j-1], + val0.val.c->size_peratom_cols,weights,stride); } else if (kind == LOCAL) { - if (!(val.val.c->invoked_flag & Compute::INVOKED_LOCAL)) { - val.val.c->compute_local(); - val.val.c->invoked_flag |= Compute::INVOKED_LOCAL; + if (!(val0.val.c->invoked_flag & Compute::INVOKED_LOCAL)) { + val0.val.c->compute_local(); + val0.val.c->invoked_flag |= Compute::INVOKED_LOCAL; } if (j == 0) - bin_vector_weights(val.val.c->size_local_rows, - val.val.c->vector_local,1,weights,stride); - else if (val.val.c->array_local) - bin_vector_weights(val.val.c->size_local_rows, - &val.val.c->array_local[0][j-1], - val.val.c->size_local_cols,weights,stride); + bin_vector_weights(val0.val.c->size_local_rows, + val0.val.c->vector_local,1,weights,stride); + else if (val0.val.c->array_local) + bin_vector_weights(val0.val.c->size_local_rows, + &val0.val.c->array_local[0][j-1], + val0.val.c->size_local_cols,weights,stride); } // access fix fields, guaranteed to be ready - } else if (val.which == ArgInfo::FIX) { + } else if (val0.which == ArgInfo::FIX) { if (kind == GLOBAL && mode == SCALAR) { - if (j == 0) bin_one_weights(val.val.f->compute_scalar(),weight); - else bin_one_weights(val.val.f->compute_vector(j-1),weight); + if (j == 0) bin_one_weights(val0.val.f->compute_scalar(),weight); + else bin_one_weights(val0.val.f->compute_vector(j-1),weight); } else if (kind == GLOBAL && mode == VECTOR) { if (j == 0) { - int n = val.val.f->size_vector; + int n = val0.val.f->size_vector; for (int i = 0; i < n; i++) - bin_one_weights(val.val.f->compute_vector(i),weights[i*stride]); + bin_one_weights(val0.val.f->compute_vector(i),weights[i*stride]); } else { - int n = val.val.f->size_vector; + int n = val0.val.f->size_vector; for (int i = 0; i < n; i++) - bin_one_weights(val.val.f->compute_array(i,j-1),weights[i*stride]); + bin_one_weights(val0.val.f->compute_array(i,j-1),weights[i*stride]); } } else if (kind == PERATOM) { if (j == 0) - bin_atoms_weights(val.val.f->vector_atom,1,weights,stride); - else if (val.val.f->array_atom) - bin_atoms_weights(&val.val.f->array_atom[0][j-1],val.val.f->size_peratom_cols, + bin_atoms_weights(val0.val.f->vector_atom,1,weights,stride); + else if (val0.val.f->array_atom) + bin_atoms_weights(&val0.val.f->array_atom[0][j-1],val0.val.f->size_peratom_cols, weights,stride); } else if (kind == LOCAL) { - if (j == 0) bin_vector_weights(val.val.f->size_local_rows,val.val.f->vector_local,1, + if (j == 0) bin_vector_weights(val0.val.f->size_local_rows,val0.val.f->vector_local,1, weights,stride); - else if (val.val.f->array_local) - bin_vector_weights(val.val.f->size_local_rows,&val.val.f->array_local[0][j-1], - val.val.f->size_local_cols,weights,stride); + else if (val0.val.f->array_local) + bin_vector_weights(val0.val.f->size_local_rows,&val0.val.f->array_local[0][j-1], + val0.val.f->size_local_cols,weights,stride); } // evaluate equal-style variable - } else if (val.which == ArgInfo::VARIABLE && kind == GLOBAL) { - bin_one_weights(input->variable->compute_equal(val.val.v),weight); + } else if (val0.which == ArgInfo::VARIABLE && kind == GLOBAL) { + bin_one_weights(input->variable->compute_equal(val0.val.v),weight); - } else if (val.which == ArgInfo::VARIABLE && kind == PERATOM) { + } else if (val0.which == ArgInfo::VARIABLE && kind == PERATOM) { if (atom->nmax > maxatom) { memory->destroy(vector); maxatom = atom->nmax; memory->create(vector,maxatom,"ave/histo/weight:vector"); } - input->variable->compute_atom(val.val.v,igroup,vector,1,0); + input->variable->compute_atom(val0.val.v,igroup,vector,1,0); bin_atoms_weights(vector,1,weights,stride); } diff --git a/src/fix_ave_time.cpp b/src/fix_ave_time.cpp index 5219a4de3d..417e0fd97a 100644 --- a/src/fix_ave_time.cpp +++ b/src/fix_ave_time.cpp @@ -178,7 +178,7 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) : if (val.argindex && (val.val.f->array_flag == 0)) error->all(FLERR,"Fix ave/time fix {} does not calculate an array", val.id); if (val.argindex && (val.val.f->size_array_rows_variable)) - error->all(FLERR,"Fix ave/time fix {} array cannot be variable length", val.id); + error->all(FLERR,"Fix ave/time fix {} array cannot have variable row length", val.id); if (val.argindex && (val.argindex > val.val.f->size_array_cols)) error->all(FLERR,"Fix ave/time fix {} array is accessed out-of-range", val.id); if (nevery % val.val.f->global_freq) @@ -562,7 +562,8 @@ void FixAveTime::invoke_scalar(bigint ntimestep) scalar = val.val.f->compute_vector(val.argindex-1); // evaluate equal-style or vector-style variable - // ensure no out-of-range access to vector-style variable + // if index exceeds vector length, use a zero value + // this can be useful if vector length is not known a priori } else if (val.which == ArgInfo::VARIABLE) { if (val.argindex == 0) @@ -570,7 +571,7 @@ void FixAveTime::invoke_scalar(bigint ntimestep) else { double *varvec; int nvec = input->variable->compute_vector(val.val.v,&varvec); - if (nvec < val.argindex) scalar = 0.0; + if (val.argindex > nvec) scalar = 0.0; else scalar = varvec[val.argindex-1]; } } diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp index de96b5c39d..2cbdb44ab0 100644 --- a/src/fix_property_atom.cpp +++ b/src/fix_property_atom.cpp @@ -328,8 +328,9 @@ void FixPropertyAtom::read_data_section(char *keyword, int n, char *buf, tagint try { ValueTokenizer values(buf); if ((int) values.count() != values_peratom + 1) - error->all(FLERR, "Incorrect format in {} section of data file: {} expected {} and got {}", - keyword, buf, values_peratom + 1, values.count()); + error->all(FLERR, "Incorrect format in {} section of data file: {}\n" + "expected {} parameters and got {}{}", keyword, utils::trim(buf), + values_peratom + 1, values.count(), utils::errorurl(2)); itag = values.next_tagint() + id_offset; if (itag <= 0 || itag > map_tag_max) diff --git a/src/math_eigen.cpp b/src/math_eigen.cpp index 65c3fa806a..c79a61f84a 100644 --- a/src/math_eigen.cpp +++ b/src/math_eigen.cpp @@ -31,7 +31,7 @@ using namespace MathEigen; typedef Jacobi Jacobi_v1; typedef Jacobi Jacobi_v2; -int MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3]) +int MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3], int sort) { // make copy of const matrix @@ -44,7 +44,15 @@ int MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3]) // create instance of generic Jacobi class and get eigenvalues and -vectors Jacobi_v1 ecalc3(3, M, midx); - int ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v1::SORT_DECREASING_EVALS); + int ierror = 1; + if (sort == -1) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v1::SORT_DECREASING_EVALS); + else if (sort == 0) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v1::DO_NOT_SORT); + else if (sort == 1) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v1::SORT_INCREASING_EVALS); + + if (ierror) return ierror; // transpose the evec matrix @@ -54,7 +62,7 @@ int MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3]) return ierror; } -int MathEigen::jacobi3(double const *const *mat, double *eval, double **evec) +int MathEigen::jacobi3(double const *const *mat, double *eval, double **evec, int sort) { // make copy of const matrix @@ -67,7 +75,15 @@ int MathEigen::jacobi3(double const *const *mat, double *eval, double **evec) // create instance of generic Jacobi class and get eigenvalues and -vectors Jacobi_v2 ecalc3(3, M, midx); - int ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v2::SORT_DECREASING_EVALS); + int ierror = 1; + if (sort == -1) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v2::SORT_DECREASING_EVALS); + else if (sort == 0) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v2::DO_NOT_SORT); + else if (sort == 1) + ierror = ecalc3.Diagonalize(mat, eval, evec, Jacobi_v2::SORT_INCREASING_EVALS); + + if (ierror) return ierror; // transpose the evec matrix diff --git a/src/math_eigen.h b/src/math_eigen.h index 9f2431e14a..c7a3853767 100644 --- a/src/math_eigen.h +++ b/src/math_eigen.h @@ -22,13 +22,14 @@ namespace MathEigen { * \param mat the 3x3 matrix you wish to diagonalize * \param eval store the eigenvalues here * \param evec store the eigenvectors here... + * \param sort order eigenvalues and -vectors (-1 decreasing (default), 1 increasing, 0 unsorted) * \return 0 if eigenvalue calculation converged, 1 if it failed */ -int jacobi3(double const *const *mat, double *eval, double **evec); +int jacobi3(double const *const *mat, double *eval, double **evec, int sort = -1); /** \overload */ -int jacobi3(double const mat[3][3], double *eval, double evec[3][3]); +int jacobi3(double const mat[3][3], double *eval, double evec[3][3], int sort = -1); } // namespace MathEigen diff --git a/src/platform.cpp b/src/platform.cpp index b324bd0b5c..7e9fa820fa 100644 --- a/src/platform.cpp +++ b/src/platform.cpp @@ -249,12 +249,7 @@ std::string platform::os_info() } else if (build == "22631") { buf = "Windows 11 23H2"; } else { - const char *entry = "ProductName"; - RegGetValue(HKEY_LOCAL_MACHINE, subkey, entry, RRF_RT_REG_SZ, nullptr, &value, - (LPDWORD) &value_length); - // enforce zero termination - value[1023] = '\0'; - buf = value; + buf = "Windows Build " + build; } DWORD fullversion, majorv, minorv, buildv = 0; fullversion = GetVersion(); diff --git a/src/read_data.cpp b/src/read_data.cpp index 657369d5d0..36bbce605f 100644 --- a/src/read_data.cpp +++ b/src/read_data.cpp @@ -1360,7 +1360,8 @@ void ReadData::header(int firstpass) // check that exiting string is a valid section keyword parse_keyword(1); - if (!is_data_section(keyword)) error->all(FLERR, "Unknown identifier in data file: {}", keyword); + if (!is_data_section(keyword)) + error->all(FLERR, "Unknown identifier in data file: {}{}", keyword, utils::errorurl(1)); // error checks on header values // must be consistent with atom style and other header values diff --git a/src/thermo.cpp b/src/thermo.cpp index efc5f984fc..5ef5eb59b8 100644 --- a/src/thermo.cpp +++ b/src/thermo.cpp @@ -1490,9 +1490,8 @@ void Thermo::compute_compute() if (normflag && compute->extscalar) dvalue /= natoms; } else if (compute_which[m] == VECTOR) { if (compute->size_vector_variable && argindex1[ifield] > compute->size_vector) - dvalue = 0.0; - else - dvalue = compute->vector[argindex1[ifield] - 1]; + error->all(FLERR, "Thermo compute vector is accessed out-of-range"); + dvalue = compute->vector[argindex1[ifield] - 1]; if (normflag) { if (compute->extvector == 0) return; @@ -1503,9 +1502,8 @@ void Thermo::compute_compute() } } else { if (compute->size_array_rows_variable && argindex1[ifield] > compute->size_array_rows) - dvalue = 0.0; - else - dvalue = compute->array[argindex1[ifield] - 1][argindex2[ifield] - 1]; + error->all(FLERR, "Thermo compute array is accessed out-of-range"); + dvalue = compute->array[argindex1[ifield] - 1][argindex2[ifield] - 1]; if (normflag && compute->extarray) dvalue /= natoms; } } @@ -1517,10 +1515,14 @@ void Thermo::compute_fix() int m = field2index[ifield]; Fix *fix = fixes[m]; + // check for out-of-range access if vector/array is variable length + if (argindex1[ifield] == 0) { dvalue = fix->compute_scalar(); if (normflag && fix->extscalar) dvalue /= natoms; } else if (argindex2[ifield] == 0) { + if (fix->size_vector_variable && argindex1[ifield] > fix->size_vector) + error->all(FLERR, "Thermo fix vector is accessed out-of-range"); dvalue = fix->compute_vector(argindex1[ifield] - 1); if (normflag) { if (fix->extvector == 0) @@ -1531,6 +1533,8 @@ void Thermo::compute_fix() dvalue /= natoms; } } else { + if (fix->size_array_rows_variable && argindex1[ifield] > fix->size_array_rows) + error->all(FLERR, "Thermo fix array is accessed out-of-range"); dvalue = fix->compute_array(argindex1[ifield] - 1, argindex2[ifield] - 1); if (normflag && fix->extarray) dvalue /= natoms; } @@ -1542,15 +1546,17 @@ void Thermo::compute_variable() { int iarg = argindex1[ifield]; + // evaluate equal-style or vector-style variable + // if index exceeds vector length, use a zero value + // this can be useful if vector length is not known a priori + if (iarg == 0) dvalue = input->variable->compute_equal(variables[field2index[ifield]]); else { double *varvec; int nvec = input->variable->compute_vector(variables[field2index[ifield]], &varvec); - if (nvec < iarg) - dvalue = 0.0; - else - dvalue = varvec[iarg - 1]; + if (iarg > nvec) dvalue = 0.0; + else dvalue = varvec[iarg - 1]; } } diff --git a/src/variable.cpp b/src/variable.cpp index c1cf978028..89d15d38f5 100644 --- a/src/variable.cpp +++ b/src/variable.cpp @@ -369,7 +369,8 @@ void Variable::set(int narg, char **arg) data[nvar][0] = new char[MAXLINE]; reader[nvar] = new VarReader(lmp,arg[0],arg[2],SCALARFILE); int flag = reader[nvar]->read_scalar(data[nvar][0]); - if (flag) error->all(FLERR,"File variable could not read value"); + if (flag) + error->all(FLERR,"File variable {} could not read value from {}", arg[0], arg[2]); // ATOMFILE for numbers // which = 1st value @@ -387,7 +388,8 @@ void Variable::set(int narg, char **arg) data[nvar][0] = nullptr; reader[nvar] = new VarReader(lmp,arg[0],arg[2],ATOMFILE); int flag = reader[nvar]->read_peratom(); - if (flag) error->all(FLERR,"Atomfile variable could not read values"); + if (flag) + error->all(FLERR,"Atomfile variable {} could not read values from {}", arg[0], arg[2]); // FORMAT // num = 3, which = 1st value @@ -1225,7 +1227,6 @@ int Variable::compute_vector(int ivar, double **result) int nlen = size_tree_vector(tree); if (nlen == 0) print_var_error(FLERR,"Vector-style variable has zero length",ivar); - if (nlen < 0) print_var_error(FLERR,"Inconsistent lengths in vector-style variable",ivar); @@ -1535,9 +1536,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) if (!compute->vector_flag) print_var_error(FLERR,"Mismatched compute in variable formula",ivar); - if (index1 > compute->size_vector && - compute->size_vector_variable == 0) - print_var_error(FLERR,"Variable formula compute vector is accessed out-of-range",ivar,0); if (!compute->is_initialized()) print_var_error(FLERR,"Variable formula compute cannot be invoked before " "initialization by a run",ivar); @@ -1546,10 +1544,14 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) compute->invoked_flag |= Compute::INVOKED_VECTOR; } - if (compute->size_vector_variable && - index1 > compute->size_vector) value1 = 0.0; - else value1 = compute->vector[index1-1]; - argstack[nargstack++] = value1; + // wait to check index1 until after compute invocation + // to allow for computes with size_vector_variable == 1 + + if (index1 > compute->size_vector) + print_var_error(FLERR,"Variable formula compute vector is accessed out-of-range",ivar,0); + + value1 = compute->vector[index1-1]; + argstack[nargstack++] = value1; // c_ID[i][j] = scalar from global array @@ -1557,9 +1559,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) if (!compute->array_flag) print_var_error(FLERR,"Mismatched compute in variable formula",ivar); - if (index1 > compute->size_array_rows && - compute->size_array_rows_variable == 0) - print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0); if (index2 > compute->size_array_cols) print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0); if (!compute->is_initialized()) @@ -1570,9 +1569,13 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) compute->invoked_flag |= Compute::INVOKED_ARRAY; } - if (compute->size_array_rows_variable && - index1 > compute->size_array_rows) value1 = 0.0; - else value1 = compute->array[index1-1][index2-1]; + // wait to check index1 until after compute invocation + // to allow for computes with size_array_rows_variable == 1 + + if (index1 > compute->size_array_rows) + print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0); + + value1 = compute->array[index1-1][index2-1]; argstack[nargstack++] = value1; // C_ID[i] = scalar element of per-atom vector, note uppercase "C" @@ -1634,8 +1637,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) if (!compute->vector_flag) print_var_error(FLERR,"Mismatched compute in variable formula",ivar); - if (compute->size_vector == 0) - print_var_error(FLERR,"Variable formula compute vector is zero length",ivar); if (!compute->is_initialized()) print_var_error(FLERR,"Variable formula compute cannot be invoked before " "initialization by a run",ivar); @@ -1644,6 +1645,12 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) compute->invoked_flag |= Compute::INVOKED_VECTOR; } + // wait to check vector size until after compute invocation + // to allow for computes with size_vector_variable == 1 + + if (compute->size_vector == 0) + print_var_error(FLERR,"Variable formula compute vector is zero length",ivar); + auto newtree = new Tree(); newtree->type = VECTORARRAY; newtree->array = compute->vector; @@ -1666,10 +1673,12 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) compute->compute_array(); compute->invoked_flag |= Compute::INVOKED_ARRAY; } - // wait until after compute invocation to check size_array_rows - // b/c may be zero until after initial invocation + + // wait to check row count until after compute invocation + // to allow for computes with size_array_rows_variable == 1 + if (compute->size_array_rows == 0) - print_var_error(FLERR,"Variable formula compute array is zero length",ivar); + print_var_error(FLERR,"Variable formula compute array has zero rows",ivar); auto newtree = new Tree(); newtree->type = VECTORARRAY; @@ -1800,8 +1809,7 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) if (!fix->vector_flag) print_var_error(FLERR,"Mismatched fix in variable formula",ivar); - if (index1 > fix->size_vector && - fix->size_vector_variable == 0) + if (index1 > fix->size_vector) print_var_error(FLERR,"Variable formula fix vector is accessed out-of-range",ivar,0); if (update->whichflag > 0 && update->ntimestep % fix->global_freq) print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar); @@ -1815,8 +1823,7 @@ double Variable::evaluate(char *str, Tree **tree, int ivar) if (!fix->array_flag) print_var_error(FLERR,"Mismatched fix in variable formula",ivar); - if (index1 > fix->size_array_rows && - fix->size_array_rows_variable == 0) + if (index1 > fix->size_array_rows) print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0); if (index2 > fix->size_array_cols) print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0); @@ -5400,7 +5407,8 @@ VarReader::VarReader(LAMMPS *lmp, char *name, char *file, int flag) : if (me == 0) { fp = fopen(file,"r"); if (fp == nullptr) - error->one(FLERR,"Cannot open file variable file {}: {}", file, utils::getsyserror()); + error->one(FLERR,"Cannot open {} variable {} file {}: {}", (style == Variable::ATOMFILE) + ? "atomfile" : "file", name, file, utils::getsyserror()); } // if atomfile-style variable, must store per-atom values read from file @@ -5458,14 +5466,12 @@ int VarReader::read_scalar(char *str) while (true) { ptr = fgets(str,MAXLINE,fp); if (!ptr) { n=0; break; } // end of file - ptr[strcspn(ptr,"#")] = '\0'; // strip comment - ptr += strspn(ptr," \t\n\r\f"); // strip leading whitespace - ptr[strcspn(ptr," \t\n\r\f")] = '\0'; // strip trailing whitespace - n = strlen(ptr) + 1; + auto line = utils::trim(utils::trim_comment(str)); + n = line.size() + 1; if (n == 1) continue; // skip if blank line + memcpy(str, line.c_str(), n); break; } - if (n > 0) memmove(str,ptr,n); // move trimmed string back } MPI_Bcast(&n,1,MPI_INT,0,world); if (n == 0) return 1; @@ -5481,9 +5487,9 @@ int VarReader::read_scalar(char *str) int VarReader::read_peratom() { - int i,m,n,nchunk,eof; + int i,m,nchunk,eof; tagint tag; - char *ptr,*next; + char *ptr; double value; // set all per-atom values to 0.0 @@ -5497,24 +5503,22 @@ int VarReader::read_peratom() // read one string from file, convert to Nlines char str[MAXLINE]; + bigint nlines = 0; if (me == 0) { while (true) { ptr = fgets(str,MAXLINE,fp); - if (!ptr) { n=0; break; } // end of file - ptr[strcspn(ptr,"#")] = '\0'; // strip comment - ptr += strspn(ptr," \t\n\r\f"); // strip leading whitespace - ptr[strcspn(ptr," \t\n\r\f")] = '\0'; // strip trailing whitespace - n = strlen(ptr) + 1; - if (n == 1) continue; // skip if blank line + if (!ptr) { nlines = 0; break; } // end of file + Tokenizer words(utils::trim(utils::trim_comment(str))); + if (words.count() == 0) continue; // skip if blank or comment line + if (words.count() != 1) + error->one(FLERR, "Expected 1 token but found {} when parsing {}", words.count(), str); + nlines = utils::bnumeric(FLERR,words.next(),true,lmp); break; } - memmove(str,ptr,n); // move trimmed string back } + MPI_Bcast(&nlines,1,MPI_LMP_BIGINT,0,world); + if (nlines == 0) return 1; - MPI_Bcast(&n,1,MPI_INT,0,world); - if (n == 0) return 1; - MPI_Bcast(str,n,MPI_CHAR,0,world); - bigint nlines = utils::bnumeric(FLERR,str,false,lmp); tagint map_tag_max = atom->map_tag_max; bigint nread = 0; @@ -5523,24 +5527,22 @@ int VarReader::read_peratom() eof = utils::read_lines_from_file(fp,nchunk,MAXLINE,buffer,me,world); if (eof) return 1; - char *buf = buffer; - for (i = 0; i < nchunk; i++) { - next = strchr(buf,'\n'); - *next = '\0'; + for (const auto &line : utils::split_lines(buffer)) { try { - ValueTokenizer words(buf); + ValueTokenizer words(utils::trim_comment(utils::trim(line))); + if (words.count() == 0) continue; // skip comment or empty lines + if (words.count() != 2) + throw TokenizerException(fmt::format("expected 2 tokens but found {}", words.count()), ""); tag = words.next_bigint(); value = words.next_double(); + ++nread; } catch (TokenizerException &e) { - error->all(FLERR,"Invalid atomfile line '{}': {}",buf,e.what()); + error->all(FLERR,"Invalid atomfile line '{}': {}", line, e.what()); } if ((tag <= 0) || (tag > map_tag_max)) error->all(FLERR,"Invalid atom ID {} in variable file", tag); if ((m = atom->map(tag)) >= 0) vstore[m] = value; - buf = next + 1; } - - nread += nchunk; } return 0; diff --git a/unittest/commands/test_variables.cpp b/unittest/commands/test_variables.cpp index c631b69528..6748867b4e 100644 --- a/unittest/commands/test_variables.cpp +++ b/unittest/commands/test_variables.cpp @@ -216,7 +216,7 @@ TEST_F(VariableTest, CreateDelete) command("variable one internal 2");); TEST_FAILURE(".*ERROR: Cannot use atomfile-style variable unless an atom map exists.*", command("variable eleven atomfile test_variable.atomfile");); - TEST_FAILURE(".*ERROR on proc 0: Cannot open file variable file test_variable.xxx.*", + TEST_FAILURE(".*ERROR on proc 0: Cannot open file variable nine1 file test_variable.xxx.*", command("variable nine1 file test_variable.xxx");); TEST_FAILURE(".*ERROR: World variable count doesn't match # of partitions.*", command("variable ten10 world xxx xxx");); @@ -293,7 +293,7 @@ TEST_F(VariableTest, AtomicSystem) command("variable one atom x");); TEST_FAILURE(".*ERROR: Cannot redefine variable as a different style.*", command("variable id vector f_press");); - TEST_FAILURE(".*ERROR on proc 0: Cannot open file variable file test_variable.xxx.*", + TEST_FAILURE(".*ERROR on proc 0: Cannot open atomfile variable ten1 file test_variable.xxx.*", command("variable ten1 atomfile test_variable.xxx");); TEST_FAILURE(".*ERROR: Variable loop: has a circular dependency.*", variable->compute_equal("v_loop");); diff --git a/unittest/force-styles/check_tests.py b/unittest/force-styles/check_tests.py index df71640789..4dba8f9b9e 100755 --- a/unittest/force-styles/check_tests.py +++ b/unittest/force-styles/check_tests.py @@ -54,7 +54,7 @@ improper = {} kspace = {} pair = {} -style_pattern = re.compile("(.+)Style\((.+),(.+)\)") +style_pattern = re.compile("(.+)Style\\((.+),(.+)\\)") upper = re.compile("[A-Z]+") gpu = re.compile("(.+)/gpu$") intel = re.compile("(.+)/intel$") @@ -176,19 +176,19 @@ def check_tests(name,styles,yaml,search,skip=()): counter = 0 counter += check_tests('pair',pair,'*-pair-*.yaml', - '.*pair_style:\s*((\S+).*)?',skip=('meam','lj/sf')) + '.*pair_style:\\s*((\\S+).*)?',skip=('meam','lj/sf')) counter += check_tests('bond',bond,'bond-*.yaml', - '.*bond_style:\s*((\S+).*)?') + '.*bond_style:\\s*((\\S+).*)?') counter += check_tests('angle',angle,'angle-*.yaml', - '.*angle_style:\s*((\S+).*)?') + '.*angle_style:\\s*((\\S+).*)?') counter += check_tests('dihedral',dihedral,'dihedral-*.yaml', - '.*dihedral_style:\s*((\S+).*)?') + '.*dihedral_style:\\s*((\\S+).*)?') counter += check_tests('improper',improper,'improper-*.yaml', - '.*improper_style:\s*((\S+).*)?') + '.*improper_style:\\s*((\\S+).*)?') counter += check_tests('kspace',kspace,'kspace-*.yaml', - '.*kspace_style\s*((\S+).*)?') + '.*kspace_style\\s*((\\S+).*)?') counter += check_tests('fix',fix,'fix-*.yaml', - ' fix\s+((\S+)\s*)?') + ' fix\\s+((\\S+)\\s*)?') total = len(pair)+len(bond)+len(angle)+len(dihedral)+len(improper)+len(kspace)+len(fix) print(f"\nTotal tests missing: {counter} of {total}") diff --git a/unittest/force-styles/tests/atomic-pair-pedone.yaml b/unittest/force-styles/tests/atomic-pair-pedone.yaml new file mode 100644 index 0000000000..ea97d9ee8c --- /dev/null +++ b/unittest/force-styles/tests/atomic-pair-pedone.yaml @@ -0,0 +1,112 @@ +--- +lammps_version: 7 Feb 2024 +tags: +date_generated: Tue Apr 9 07:44:34 2024 +epsilon: 7.5e-13 +skip_tests: +prerequisites: ! | + pair pedone + pair coul/dsf +pre_commands: ! | + echo screen + atom_modify map array + units metal + atom_style charge + lattice fcc 4.01 + region box block 0 2 0 2 0 2 + create_box 2 box + create_atoms 1 box + displace_atoms all random 0.1 0.1 0.1 623426 + mass 1 40.0 + mass 2 16.0 + set type 1 type/fraction 2 0.5 998877 + set type 1 charge 1.2 + set type 2 charge -1.2 + velocity all create 100 4534624 loop geom +post_commands: ! "" +input_file: in.empty +pair_style: hybrid/overlay pedone 8.0 coul/dsf 0.05 8.0 +pair_coeff: ! | + * * coul/dsf + 1 2 pedone 0.030211 2.241334 2.923245 5.0 +extract: ! | + c0 2 + d0 2 + r0 2 + alpha 2 +natoms: 32 +init_vdwl: -0.05846735245123568 +init_coul: -127.6163776098739 +init_stress: ! |- + -2.3766889264059056e+01 -4.4271670366067475e+01 -2.5180846524036202e+01 5.0038886969182466e+00 1.8669335074554186e-01 -5.7263597257471289e+01 +init_forces: ! |2 + 1 1.3068329866550368e+00 -3.2426393314368136e+00 -1.7090059822217059e+00 + 2 -8.3708962469663994e-01 8.0332716327131255e+00 3.3880696306892157e+00 + 3 1.0817171269316166e+00 6.9121377930223904e-01 6.1164739509900095e+00 + 4 -6.2705188270402727e+00 -8.8174699380428727e+00 -6.6683743935745587e+00 + 5 2.8006317273000447e-01 -1.4785785338160322e+00 2.1291363801130675e-01 + 6 -9.9407772093842439e-02 3.0597043453576771e+00 2.6317878547215199e+00 + 7 -3.0340486322792599e+00 1.3184658303824097e+00 3.6223571622028894e+00 + 8 5.0115915641263573e+00 -4.9677051620674799e+00 -6.1712277135681015e+00 + 9 -6.1985573132343514e+00 -2.9794836727762029e+00 -1.3747449204783135e+00 + 10 6.8202263133821162e-02 -8.8740995444094821e+00 -3.7669178307548044e+00 + 11 1.8229752044042762e+00 1.2148573816886858e+00 -4.4347407582895784e+00 + 12 -3.9294213238803009e+00 8.8686068238310583e+00 -1.5696200877040254e+00 + 13 4.6671449089488757e+00 -4.6189653076982848e+00 -1.3124536997564062e+00 + 14 4.5651421484368113e-02 -2.3521973378049634e+00 -1.6447916834323362e+00 + 15 2.0718235766251047e+00 -1.0016787355222116e+00 2.6102514913196893e+00 + 16 4.2906354719273310e+00 5.5091935314875675e+00 2.3081189742020186e-01 + 17 2.4088963659786491e+00 2.6695675589930521e+00 3.5122235738936918e+00 + 18 3.4512362576614999e+00 -4.1430347952161721e+00 4.2175118122911428e+00 + 19 -1.1172781222643726e+00 5.0374260939675146e+00 6.1287692774066489e+00 + 20 4.7466340882928550e+00 4.2198180415705018e+00 -8.2329099903756351e+00 + 21 4.9922952432844170e-01 -3.3643476589192347e+00 -3.4586600244054257e+00 + 22 1.9519912199733915e+00 2.3651493661604901e+00 1.9718930394593539e+00 + 23 3.2436581993388014e+00 9.2909318945485386e-01 4.6199095913337018e+00 + 24 -4.6577163797485408e+00 1.0944440231458034e-01 -3.9975755528276791e+00 + 25 -8.9409463869190198e+00 2.4726478248668329e+00 3.7066236847004816e+00 + 26 1.1745295461487557e+00 -6.0775548138182502e+00 3.9173262687577626e-01 + 27 2.1387234883139223e+00 -9.1458060453528878e-01 3.4582641764483730e+00 + 28 -5.4502482566484289e+00 3.4786802718788201e+00 2.2450528597292503e+00 + 29 -7.5667651846088226e+00 -1.4577149705984696e+00 -3.9568568740165220e+00 + 30 3.1230549128026425e-01 1.3068114717904746e+00 1.5378868153458369e+00 + 31 3.8386185003687259e+00 2.2553073046534435e+00 -1.3273631790345837e+00 + 32 3.6895374447617666e+00 7.5079155624842708e-01 -9.7729039239942395e-01 +run_vdwl: -0.05936305172833948 +run_coul: -127.68271387147016 +run_stress: ! |- + -2.3775085785175861e+01 -4.4287200358424037e+01 -2.5219014434426338e+01 5.0215354612619398e+00 1.6449805968891407e-01 -5.7274887432938485e+01 +run_forces: ! |2 + 1 1.2991722692082786e+00 -3.2346896741423627e+00 -1.7041329770094276e+00 + 2 -8.3577389730915552e-01 8.0381530163215924e+00 3.3923109270711151e+00 + 3 1.0835047785054042e+00 6.9319911007073187e-01 6.1150295417259430e+00 + 4 -6.2592674457409263e+00 -8.8205393549813724e+00 -6.6664974877688721e+00 + 5 2.8319324224275277e-01 -1.4769949542028844e+00 2.1366042708413130e-01 + 6 -1.0153594962155643e-01 3.0590371871661279e+00 2.6305719846785136e+00 + 7 -3.0312781169981404e+00 1.3253014477860192e+00 3.6137212324460495e+00 + 8 5.0065920194766118e+00 -4.9738538898220481e+00 -6.1680653045217282e+00 + 9 -6.1953589072633033e+00 -2.9840507476418630e+00 -1.3858506137213893e+00 + 10 7.2110931001166223e-02 -8.8715664120515498e+00 -3.7622114666042830e+00 + 11 1.8261227725248501e+00 1.2127884088947380e+00 -4.4352512345078896e+00 + 12 -3.9286782746032927e+00 8.8761060543305845e+00 -1.5679267227812392e+00 + 13 4.6639719242626425e+00 -4.6221575223138132e+00 -1.3189131763776332e+00 + 14 4.5202926162814835e-02 -2.3548742883712031e+00 -1.6491976796732051e+00 + 15 2.0720535642022995e+00 -1.0069819281480463e+00 2.6083600468918844e+00 + 16 4.2902983639126537e+00 5.5131718679546795e+00 2.3830711385553280e-01 + 17 2.4096838033622459e+00 2.6710791048618789e+00 3.5085731540934209e+00 + 18 3.4514062725618015e+00 -4.1373868590762886e+00 4.2169294048595312e+00 + 19 -1.1264411792212099e+00 5.0227271243385347e+00 6.1237583999889607e+00 + 20 4.7393545880453329e+00 4.2216825100295452e+00 -8.2318896574215081e+00 + 21 5.0316022185802045e-01 -3.3658905024666987e+00 -3.4562503451177298e+00 + 22 1.9368078287723918e+00 2.3534799016713426e+00 1.9729696376413666e+00 + 23 3.2400632484576661e+00 9.3259676426325444e-01 4.6157700185053541e+00 + 24 -4.6586992607556210e+00 1.1241261874328348e-01 -3.9896118917888690e+00 + 25 -8.9349626587186393e+00 2.4768648221903695e+00 3.7112080381606569e+00 + 26 1.1679346190142463e+00 -6.0828712249034158e+00 3.8801942263557465e-01 + 27 2.1417538088342263e+00 -9.0746081465677486e-01 3.4626716637488029e+00 + 28 -5.4328133673274284e+00 3.4793352823397115e+00 2.2408588660642268e+00 + 29 -7.5599171237192415e+00 -1.4574882072609379e+00 -3.9594670101361755e+00 + 30 3.1095531392911402e-01 1.2978745340983431e+00 1.5451191860370526e+00 + 31 3.8408681082369638e+00 2.2573010571848813e+00 -1.3280944539185844e+00 + 32 3.6805155767070339e+00 7.5369556779361924e-01 -9.7447904413958231e-01 +... diff --git a/unittest/force-styles/tests/fix-timestep-rigid_group.yaml b/unittest/force-styles/tests/fix-timestep-rigid_group.yaml index e28e0abb08..e434ace375 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_group.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_group.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:10 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,65 +15,65 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.4245356937318884e+03 -1.4496493315649691e+03 -3.6144360984224995e+03 8.4840626828644076e+02 2.0318336761611761e+02 -6.0622397707970140e+02 -global_scalar: 15.711521423178082 + -1.4245356937318909e+03 -1.4496493315649632e+03 -3.6144360984225123e+03 8.4840626828644361e+02 2.0318336761612773e+02 -6.0622397707969583e+02 +global_scalar: 15.711521423178084 run_pos: ! |2 - 1 -2.7899546863891400e-01 2.4731857340328229e+00 -1.7290667740242271e-01 - 2 3.0296221610264262e-01 2.9517129916957545e+00 -8.5798904387773245e-01 - 3 -6.9368802364134741e-01 1.2445115421754194e+00 -6.2281111198650418e-01 - 4 -1.5764879647103154e+00 1.4919714415841279e+00 -1.2492069414674623e+00 - 5 -8.9434512967429969e-01 9.3651699743511030e-01 4.0191726558261276e-01 - 6 2.9454439634451712e-01 2.2724545792544038e-01 -1.2845195053960268e+00 - 7 3.4049112903270051e-01 -9.4655678322458359e-03 -2.4634480020857055e+00 - 8 1.1644354555804874e+00 -4.8367776650961336e-01 -6.7663643940735863e-01 - 9 1.3781717822696469e+00 -2.5332509530010694e-01 2.6864954436590061e-01 - 10 2.0186368606041896e+00 -1.4285861423625796e+00 -9.6712491252780131e-01 - 11 1.7929137227577452e+00 -1.9875455388407426e+00 -1.8836565352266534e+00 - 12 3.0032775230399604e+00 -4.8983022415174027e-01 -1.6190248017343642e+00 - 13 4.0448964162125947e+00 -9.0213155122391020e-01 -1.6385398399479558e+00 - 14 2.6035151245015822e+00 -4.0874995493219213e-01 -2.6555999074786607e+00 - 15 2.9761196776172318e+00 5.6287237454108674e-01 -1.2442626196083388e+00 - 16 2.6517373021566168e+00 -2.3957035508393707e+00 3.3389262100692263e-02 - 17 2.2311114924744970e+00 -2.1018393228798513e+00 1.1496088522377543e+00 - 18 2.1390642573201784e+00 3.0164773560693781e+00 -3.5143984803853878e+00 - 19 1.5353246655146278e+00 2.6305911186316133e+00 -4.2455871034737074e+00 - 20 2.7649421538938390e+00 3.6818603528430849e+00 -3.9364115785985550e+00 - 21 4.9043112657298877e+00 -4.0774268210397882e+00 -3.6200836396129836e+00 - 22 4.3665322424283310e+00 -4.2075138112953594e+00 -4.4636587264885881e+00 - 23 5.7355405581985188e+00 -3.5789558641908918e+00 -3.8805763324089981e+00 - 24 2.0692780332810115e+00 3.1504920436416004e+00 3.1571131300668789e+00 - 25 1.3007297593169076e+00 3.2745259354179481e+00 2.5110163874103675e+00 - 26 2.5819416446099739e+00 4.0104903120756576e+00 3.2150249624526035e+00 + 1 -2.7899546863891755e-01 2.4731857340328198e+00 -1.7290667740241461e-01 + 2 3.0296221610263996e-01 2.9517129916957550e+00 -8.5798904387772190e-01 + 3 -6.9368802364134963e-01 1.2445115421754176e+00 -6.2281111198650141e-01 + 4 -1.5764879647103172e+00 1.4919714415841279e+00 -1.2492069414674598e+00 + 5 -8.9434512967430235e-01 9.3651699743510453e-01 4.0191726558261442e-01 + 6 2.9454439634451712e-01 2.2724545792544146e-01 -1.2845195053960266e+00 + 7 3.4049112903270240e-01 -9.4655678322404235e-03 -2.4634480020857055e+00 + 8 1.1644354555804877e+00 -4.8367776650961403e-01 -6.7663643940735962e-01 + 9 1.3781717822696455e+00 -2.5332509530011083e-01 2.6864954436590072e-01 + 10 2.0186368606041905e+00 -1.4285861423625785e+00 -9.6712491252780486e-01 + 11 1.7929137227577487e+00 -1.9875455388407386e+00 -1.8836565352266592e+00 + 12 3.0032775230399622e+00 -4.8983022415173583e-01 -1.6190248017343625e+00 + 13 4.0448964162125964e+00 -9.0213155122390454e-01 -1.6385398399479547e+00 + 14 2.6035151245015857e+00 -4.0874995493218413e-01 -2.6555999074786598e+00 + 15 2.9761196776172323e+00 5.6287237454109007e-01 -1.2442626196083335e+00 + 16 2.6517373021566182e+00 -2.3957035508393734e+00 3.3389262100686046e-02 + 17 2.2311114924744961e+00 -2.1018393228798584e+00 1.1496088522377494e+00 + 18 2.1390642573201792e+00 3.0164773560693803e+00 -3.5143984803853883e+00 + 19 1.5353246655146293e+00 2.6305911186316160e+00 -4.2455871034737100e+00 + 20 2.7649421538938399e+00 3.6818603528430875e+00 -3.9364115785985545e+00 + 21 4.9043112657298966e+00 -4.0774268210397882e+00 -3.6200836396129850e+00 + 22 4.3665322424283417e+00 -4.2075138112953612e+00 -4.4636587264885925e+00 + 23 5.7355405581985277e+00 -3.5789558641908901e+00 -3.8805763324089995e+00 + 24 2.0692780332810012e+00 3.1504920436415969e+00 3.1571131300668829e+00 + 25 1.3007297593168976e+00 3.2745259354179459e+00 2.5110163874103693e+00 + 26 2.5819416446099637e+00 4.0104903120756576e+00 3.2150249624526102e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 run_vel: ! |2 - 1 4.7093289825842508e-04 2.6351122778447809e-04 -4.4905093064114883e-04 - 2 4.9594625316470506e-04 9.4561370489630299e-05 -5.4581359894047775e-04 - 3 3.3306085115756103e-04 2.3224943880673259e-04 -2.3659455671746018e-04 - 4 3.3692327392261152e-04 2.1926810694051179e-04 -2.4716631558862516e-04 - 5 3.3642542694186002e-04 4.1797578013265738e-04 -1.8011341766657675e-04 - 6 2.0926869754934769e-04 2.6449308951578185e-05 -1.0508938983871811e-04 - 7 1.4629043007907940e-04 -1.6873376665350122e-04 -6.8354048774350921e-05 - 8 1.5844101624224881e-04 3.7728761273999780e-05 -1.9162715667090996e-05 - 9 2.1299362072601952e-04 1.6917140529157474e-04 -6.3528165037845483e-05 - 10 5.4261629412254495e-05 -9.4655528376811482e-05 1.0511362869146690e-04 - 11 -3.2194160796502724e-05 -2.2025095264758716e-04 2.0300202946212429e-04 - 12 1.2640586304750378e-04 -2.9851080445665107e-04 -7.9476371818245798e-05 - 13 8.4523575162142608e-05 -4.0583135407330561e-04 -4.7551111331700511e-05 - 14 9.9954050381270972e-05 -4.2610816481298294e-04 -7.9255633594379530e-05 - 15 2.4417481119789862e-04 -2.3521002264677992e-04 -2.4875318161048917e-04 - 16 -9.0958138549664992e-06 3.7774817121222391e-06 2.4035199548835096e-04 - 17 5.7507224523612230e-05 2.2629217444843764e-04 2.0686920072684822e-04 - 18 2.9220264989359833e-04 -6.2478376436796265e-04 8.4222594596602366e-04 - 19 2.0572616567799188e-04 -5.0334424271726639e-04 8.4953929443210648e-04 - 20 4.1224811789513022e-04 -7.4115205416011554e-04 8.3678612337507920e-04 - 21 -1.0671858777656393e-03 -1.1531171045499515e-03 7.3720674900162159e-04 - 22 -1.1066511338291710e-03 -1.0433933757600460e-03 7.4544544325708573e-04 - 23 -9.7629260480941525e-04 -1.3100872491594103e-03 7.2687284219704804e-04 - 24 4.3308126651259312e-04 -6.6527658087322801e-04 8.4451298670663606e-04 - 25 4.4565811905442889e-04 -5.1298436273584285e-04 8.5878867884521559e-04 - 26 5.9865972692022765e-04 -7.6385263287080381e-04 8.4259943226842166e-04 + 1 4.7093289825842437e-04 2.6351122778447999e-04 -4.4905093064114823e-04 + 2 4.9594625316470473e-04 9.4561370489632928e-05 -5.4581359894047732e-04 + 3 3.3306085115756054e-04 2.3224943880673362e-04 -2.3659455671746045e-04 + 4 3.3692327392261130e-04 2.1926810694051292e-04 -2.4716631558862576e-04 + 5 3.3642542694185899e-04 4.1797578013265770e-04 -1.8011341766657654e-04 + 6 2.0926869754934769e-04 2.6449308951579106e-05 -1.0508938983871929e-04 + 7 1.4629043007908003e-04 -1.6873376665349995e-04 -6.8354048774352968e-05 + 8 1.5844101624224859e-04 3.7728761274000288e-05 -1.9162715667092141e-05 + 9 2.1299362072601887e-04 1.6917140529157517e-04 -6.3528165037845917e-05 + 10 5.4261629412254576e-05 -9.4655528376811157e-05 1.0511362869146505e-04 + 11 -3.2194160796502236e-05 -2.2025095264758700e-04 2.0300202946212152e-04 + 12 1.2640586304750429e-04 -2.9851080445664956e-04 -7.9476371818247574e-05 + 13 8.4523575162143312e-05 -4.0583135407330399e-04 -4.7551111331702557e-05 + 14 9.9954050381271961e-05 -4.2610816481298121e-04 -7.9255633594381943e-05 + 15 2.4417481119789894e-04 -2.3521002264677784e-04 -2.4875318161049020e-04 + 16 -9.0958138549668516e-06 3.7774817121217089e-06 2.4035199548834928e-04 + 17 5.7507224523611227e-05 2.2629217444843685e-04 2.0686920072684740e-04 + 18 2.9220264989359860e-04 -6.2478376436796244e-04 8.4222594596602409e-04 + 19 2.0572616567799204e-04 -5.0334424271726607e-04 8.4953929443210702e-04 + 20 4.1224811789513060e-04 -7.4115205416011543e-04 8.3678612337507964e-04 + 21 -1.0671858777656406e-03 -1.1531171045499533e-03 7.3720674900162007e-04 + 22 -1.1066511338291734e-03 -1.0433933757600477e-03 7.4544544325708432e-04 + 23 -9.7629260480941644e-04 -1.3100872491594124e-03 7.2687284219704641e-04 + 24 4.3308126651259382e-04 -6.6527658087322671e-04 8.4451298670663671e-04 + 25 4.4565811905442982e-04 -5.1298436273584133e-04 8.5878867884521635e-04 + 26 5.9865972692022961e-04 -7.6385263287080262e-04 8.4259943226842242e-04 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_molecule.yaml b/unittest/force-styles/tests/fix-timestep-rigid_molecule.yaml index d667942e49..e871a351f6 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_molecule.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_molecule.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:10 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,8 +15,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -4.9200116134789873e+01 -2.6907707565987707e+01 -6.0080860422278581e+00 -2.5620423972101300e+01 -1.3450224059983967e+01 -1.4947288487003760e+00 -global_scalar: 18.3405601674144 + -4.9200116134788615e+01 -2.6907707565987401e+01 -6.0080860422276308e+00 -2.5620423972100241e+01 -1.3450224059984270e+01 -1.4947288487006070e+00 +global_scalar: 18.340560167414402 run_pos: ! |2 1 -2.7993683669226832e-01 2.4726588069312840e+00 -1.7200860244148433e-01 2 3.0197083955402204e-01 2.9515239068888608e+00 -8.5689735572907566e-01 @@ -34,15 +35,15 @@ run_pos: ! |2 15 2.9756315249791303e+00 5.6334269722969288e-01 -1.2437650754599008e+00 16 2.6517554244980306e+00 -2.3957110424978438e+00 3.2908335999178327e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 - 18 2.1392027588271301e+00 3.0171068018412779e+00 -3.5144628518856349e+00 - 19 1.5366124997074571e+00 2.6286809834111748e+00 -4.2452547844370221e+00 - 20 2.7628161763455852e+00 3.6842251687634775e+00 -3.9370881219352554e+00 - 21 4.9036621347791245e+00 -4.0757648442838548e+00 -3.6192617654515904e+00 - 22 4.3655322291888483e+00 -4.2084949965552561e+00 -4.4622011117402334e+00 - 23 5.7380414793463101e+00 -3.5841969195032672e+00 -3.8827839830470219e+00 + 18 2.1392027588271301e+00 3.0171068018412783e+00 -3.5144628518856353e+00 + 19 1.5366124997074575e+00 2.6286809834111740e+00 -4.2452547844370221e+00 + 20 2.7628161763455852e+00 3.6842251687634775e+00 -3.9370881219352558e+00 + 21 4.9036621347791236e+00 -4.0757648442838548e+00 -3.6192617654515908e+00 + 22 4.3655322291888474e+00 -4.2084949965552561e+00 -4.4622011117402343e+00 + 23 5.7380414793463110e+00 -3.5841969195032672e+00 -3.8827839830470219e+00 24 2.0701314765323930e+00 3.1499370533342330e+00 3.1565324852522938e+00 - 25 1.3030170721374779e+00 3.2711173927682249e+00 2.5081940917429768e+00 - 26 2.5776230782480045e+00 4.0127347068243875e+00 3.2182355138709275e+00 + 25 1.3030170721374787e+00 3.2711173927682244e+00 2.5081940917429759e+00 + 26 2.5776230782480041e+00 4.0127347068243884e+00 3.2182355138709284e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.6149625095704914e-04 -3.1032459262908286e-04 8.1043030117346052e-04 - 19 8.5103884665345452e-04 -1.4572280596788108e-03 1.0163621287634116e-03 - 20 -6.5204659278590683e-04 4.3989037444289853e-04 4.9909839028507901e-04 - 21 -1.3888125881903923e-03 -3.1978049143082385e-04 1.1455681499836646e-03 - 22 -1.6084223477729510e-03 -1.5355394240821117e-03 1.4772010826232375e-03 - 23 2.6392672378805124e-04 -3.9375414431174821e-03 -3.6991583139728095e-04 - 24 8.6062827067890247e-04 -9.4179873474469237e-04 5.5396395550012453e-04 - 25 1.5933645477487538e-03 -2.2139156625681695e-03 -5.5078029695647401e-04 - 26 -1.5679561743998840e-03 3.5146224354726100e-04 2.4446924193334478e-03 + 18 3.6149625095704849e-04 -3.1032459262908286e-04 8.1043030117346052e-04 + 19 8.5103884665345473e-04 -1.4572280596788095e-03 1.0163621287634121e-03 + 20 -6.5204659278590661e-04 4.3989037444289755e-04 4.9909839028507901e-04 + 21 -1.3888125881903906e-03 -3.1978049143082342e-04 1.1455681499836646e-03 + 22 -1.6084223477729526e-03 -1.5355394240821163e-03 1.4772010826232394e-03 + 23 2.6392672378804821e-04 -3.9375414431174795e-03 -3.6991583139728377e-04 + 24 8.6062827067890269e-04 -9.4179873474469291e-04 5.5396395550012388e-04 + 25 1.5933645477487551e-03 -2.2139156625681673e-03 -5.5078029695647250e-04 + 26 -1.5679561743998888e-03 3.5146224354726068e-04 2.4446924193334487e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_molecule_tri.yaml b/unittest/force-styles/tests/fix-timestep-rigid_molecule_tri.yaml index 304db9fd60..672bf8946c 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_molecule_tri.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_molecule_tri.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:11 2024 epsilon: 5e-12 skip_tests: prerequisites: ! | @@ -15,8 +16,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -4.9200116134788658e+01 -2.6907707565985344e+01 -6.0080860422268874e+00 -2.5620423972099733e+01 -1.3450224059983656e+01 -1.4947288487000705e+00 -global_scalar: 18.340560167414306 + -4.9200116134789653e+01 -2.6907707565986087e+01 -6.0080860422267843e+00 -2.5620423972100063e+01 -1.3450224059983270e+01 -1.4947288486998982e+00 +global_scalar: 18.340560167414335 run_pos: ! |2 1 -2.7993683669226854e-01 2.4726588069312836e+00 -1.7200860244148508e-01 2 3.0197083955402171e-01 2.9515239068888608e+00 -8.5689735572907555e-01 @@ -36,14 +37,14 @@ run_pos: ! |2 16 2.6517554244980301e+00 -2.3957110424978438e+00 3.2908335999177751e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 18 2.1392027588271310e+00 3.0171068018412779e+00 -3.5144628518856349e+00 - 19 1.5366124997074566e+00 2.6286809834111740e+00 -4.2452547844370239e+00 + 19 1.5366124997074584e+00 2.6286809834111722e+00 -4.2452547844370230e+00 20 2.7628161763455852e+00 3.6842251687634775e+00 -3.9370881219352558e+00 - 21 4.9036621347791245e+00 -4.0757648442838557e+00 -3.6192617654515900e+00 - 22 4.3655322291888465e+00 -4.2084949965552569e+00 -4.4622011117402334e+00 + 21 4.9036621347791245e+00 -4.0757648442838548e+00 -3.6192617654515900e+00 + 22 4.3655322291888465e+00 -4.2084949965552578e+00 -4.4622011117402343e+00 23 5.7380414793463101e+00 -3.5841969195032686e+00 -3.8827839830470232e+00 24 2.0701314765323913e+00 3.1499370533342308e+00 3.1565324852522920e+00 - 25 1.3030170721374770e+00 3.2711173927682236e+00 2.5081940917429755e+00 - 26 2.5776230782480054e+00 4.0127347068243875e+00 3.2182355138709262e+00 + 25 1.3030170721374779e+00 3.2711173927682236e+00 2.5081940917429755e+00 + 26 2.5776230782480036e+00 4.0127347068243875e+00 3.2182355138709280e+00 27 -1.9613581876744357e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678509e+00 29 -1.3108232656499084e+00 -3.5992986322410765e+00 2.2680459788743512e+00 @@ -65,15 +66,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.6149625095704681e-04 -3.1032459262907857e-04 8.1043030117346074e-04 - 19 8.5103884665345820e-04 -1.4572280596788108e-03 1.0163621287634073e-03 - 20 -6.5204659278590271e-04 4.3989037444289630e-04 4.9909839028508215e-04 - 21 -1.3888125881903852e-03 -3.1978049143082049e-04 1.1455681499836594e-03 - 22 -1.6084223477729513e-03 -1.5355394240820970e-03 1.4772010826232351e-03 - 23 2.6392672378803975e-04 -3.9375414431174569e-03 -3.6991583139727910e-04 - 24 8.6062827067889835e-04 -9.4179873474469346e-04 5.5396395550012518e-04 - 25 1.5933645477487516e-03 -2.2139156625681669e-03 -5.5078029695647542e-04 - 26 -1.5679561743998831e-03 3.5146224354726187e-04 2.4446924193334495e-03 + 18 3.6149625095704659e-04 -3.1032459262907825e-04 8.1043030117346085e-04 + 19 8.5103884665346059e-04 -1.4572280596788099e-03 1.0163621287634082e-03 + 20 -6.5204659278590227e-04 4.3989037444289446e-04 4.9909839028508150e-04 + 21 -1.3888125881903869e-03 -3.1978049143081757e-04 1.1455681499836596e-03 + 22 -1.6084223477729556e-03 -1.5355394240821013e-03 1.4772010826232407e-03 + 23 2.6392672378803953e-04 -3.9375414431174656e-03 -3.6991583139727423e-04 + 24 8.6062827067889998e-04 -9.4179873474469411e-04 5.5396395550012377e-04 + 25 1.5933645477487516e-03 -2.2139156625681634e-03 -5.5078029695647109e-04 + 26 -1.5679561743998922e-03 3.5146224354726068e-04 2.4446924193334543e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nph.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nph.yaml index c80a70b428..16ba7c079d 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nph.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nph.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:11 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,38 +15,38 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |2- - 4.3578059172167876e+01 1.7275105166465000e+01 6.7372361276631054e+01 5.1985075049901745e+01 -2.0990677389800993e+01 -7.5321398101844359e+00 -global_scalar: 29.023636440847998 + 4.3578059175768836e+01 1.7275105168781163e+01 6.7372361277896715e+01 5.1985075050485008e+01 -2.0990677388216337e+01 -7.5321398110797180e+00 +global_scalar: 29.023636439584656 run_pos: ! |2 - 1 -6.3472039825517168e-01 3.0113983126282058e+00 -8.8148450172235826e-02 - 2 6.4798884173500326e-02 3.5870486860057795e+00 -9.1146271255434463e-01 - 3 -1.1328967478840362e+00 1.5344674077762583e+00 -6.2949567786977667e-01 - 4 -2.1941320441841130e+00 1.8319737599530370e+00 -1.3824693495474225e+00 - 5 -1.3741175247360697e+00 1.1637763350569887e+00 6.0220861483086097e-01 - 6 5.5368589242158706e-02 3.1209253712244411e-01 -1.4252606627467266e+00 - 7 1.1075313780270069e-01 2.8008314824797154e-02 -2.8425552056438050e+00 - 8 1.1011987966104080e+00 -5.4254536577068713e-01 -6.9472264392660854e-01 - 9 1.3580030945401020e+00 -2.6595138115345840e-01 4.4172536708297194e-01 - 10 2.1282964643831388e+00 -1.6781145595676907e+00 -1.0442216631471304e+00 - 11 1.8571593172391605e+00 -2.3497452731071471e+00 -2.1462323657665392e+00 - 12 3.3117732698469986e+00 -5.4913311816190635e-01 -1.8274356036322548e+00 - 13 4.5640183918453143e+00 -1.0445083545907554e+00 -1.8509716390298214e+00 - 14 2.8312769330518019e+00 -4.5135848464344086e-01 -3.0735173792331993e+00 - 15 3.2788434490964296e+00 7.1618295543695254e-01 -1.3765217601452289e+00 - 16 2.8895075000232158e+00 -2.8409365554010479e+00 1.5818504152554702e-01 - 17 2.3837073405559277e+00 -2.4882133308169232e+00 1.5000885103549333e+00 - 18 2.2738793194357232e+00 3.6743407122553755e+00 -4.1408965121163197e+00 - 19 1.6572750518209336e+00 3.2770314238152451e+00 -4.8886441786593569e+00 - 20 2.9120476452800226e+00 4.3568412675031851e+00 -4.5732834167769187e+00 - 21 5.6058485050774536e+00 -4.8495065176300871e+00 -4.2655497599953458e+00 - 22 5.0552709232982114e+00 -4.9851876752032496e+00 -5.1280564953560424e+00 - 23 6.4593933585948218e+00 -4.3461765105422652e+00 -4.5350231456236889e+00 - 24 2.1823354619125279e+00 3.8552931130470363e+00 3.8953804330431208e+00 - 25 1.3973696115403698e+00 3.9794119228484153e+00 3.2321313266194949e+00 - 26 2.7018361227965517e+00 4.7379517631305443e+00 3.9583193478092706e+00 - 27 -2.6559803075358257e+00 -5.1969823689078796e+00 2.6552621488555683e+00 - 28 -3.5927802460207046e+00 -4.7943885088602283e+00 2.0214142204095413e+00 - 29 -1.8739632618339108e+00 -4.2877858778713946e+00 2.8450749793919066e+00 + 1 -6.3472039825540794e-01 3.0113983126285611e+00 -8.8148450172186088e-02 + 2 6.4798884173342230e-02 3.5870486860061987e+00 -9.1146271255438371e-01 + 3 -1.1328967478843275e+00 1.5344674077764573e+00 -6.2949567786978555e-01 + 4 -2.1941320441845233e+00 1.8319737599532644e+00 -1.3824693495475202e+00 + 5 -1.3741175247363868e+00 1.1637763350571468e+00 6.0220861483099597e-01 + 6 5.5368589242003274e-02 3.1209253712249918e-01 -1.4252606627468261e+00 + 7 1.1075313780254881e-01 2.8008314824818470e-02 -2.8425552056440617e+00 + 8 1.1011987966103707e+00 -5.4254536577072621e-01 -6.9472264392662098e-01 + 9 1.3580030945400878e+00 -2.6595138115347083e-01 4.4172536708308918e-01 + 10 2.1282964643832170e+00 -1.6781145595678604e+00 -1.0442216631471855e+00 + 11 1.8571593172392049e+00 -2.3497452731073896e+00 -2.1462323657667168e+00 + 12 3.3117732698472082e+00 -5.4913311816195076e-01 -1.8274356036323969e+00 + 13 4.5640183918456607e+00 -1.0445083545908531e+00 -1.8509716390299671e+00 + 14 2.8312769330519618e+00 -4.5135848464346928e-01 -3.0735173792334827e+00 + 15 3.2788434490966321e+00 7.1618295543705379e-01 -1.3765217601453177e+00 + 16 2.8895075000233756e+00 -2.8409365554013446e+00 1.5818504152563229e-01 + 17 2.3837073405560343e+00 -2.4882133308171808e+00 1.5000885103551624e+00 + 18 2.2738793194332434e+00 3.6743407122541889e+00 -4.1408965121171795e+00 + 19 1.6572750518219337e+00 3.2770314238270633e+00 -4.8886441786700008e+00 + 20 2.9120476452894675e+00 4.3568412674987194e+00 -4.5732834167653644e+00 + 21 5.6058485051319096e+00 -4.8495065176594299e+00 -4.2655497599906971e+00 + 22 5.0552709232924169e+00 -4.9851876754509741e+00 -5.1280564952785888e+00 + 23 6.4593933583860359e+00 -4.3461765101804879e+00 -4.5350231457223327e+00 + 24 2.1823354618683570e+00 3.8552931130563355e+00 3.8953804330779889e+00 + 25 1.3973696115700545e+00 3.9794119229082359e+00 3.2321313265764022e+00 + 26 2.7018361229436465e+00 4.7379517630364116e+00 3.9583193477161114e+00 + 27 -2.6559803075362858e+00 -5.1969823689084436e+00 2.6552621488559236e+00 + 28 -3.5927802460212725e+00 -4.7943885088607452e+00 2.0214142204098309e+00 + 29 -1.8739632618342856e+00 -4.2877858778718556e+00 2.8450749793922920e+00 run_vel: ! |2 1 7.7867804888392077e-04 5.8970331623292821e-04 -2.2179517633030531e-04 2 2.7129529964126462e-03 4.6286427111164284e-03 3.5805549693846352e-03 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.0094600491564739e-04 -2.4312792027781263e-04 6.5542049134062323e-04 - 19 7.4731683462770076e-04 -1.2894119671278408e-03 8.4327024053533397e-04 - 20 -6.2333686369976551e-04 4.4115361641690044e-04 3.7135656431834220e-04 - 21 -1.1457423793218525e-03 -1.7337748161437940e-04 9.4510018429417686e-04 - 22 -1.3457150581639313e-03 -1.2816797357047471e-03 1.2470992250388096e-03 - 23 3.6277645415306518e-04 -3.4719859048227848e-03 -4.3796817853449118e-04 - 24 7.2410992462873655e-04 -7.6012809744767037e-04 4.3327155128124943e-04 - 25 1.3921349892629666e-03 -1.9207002802664867e-03 -5.7453335109528090e-04 - 26 -1.4901465947638008e-03 4.2012923457099966e-04 2.1578545404178418e-03 + 18 3.0094600492089644e-04 -2.4312792028464785e-04 6.5542049134054972e-04 + 19 7.4731683460624917e-04 -1.2894119671240515e-03 8.4327024053305281e-04 + 20 -6.2333686369944134e-04 4.4115361644063580e-04 3.7135656432041769e-04 + 21 -1.1457423794330429e-03 -1.7337748206069275e-04 9.4510018428907005e-04 + 22 -1.3457150585185161e-03 -1.2816797348700177e-03 1.2470992253076274e-03 + 23 3.6277645495226573e-04 -3.4719859038751704e-03 -4.3796817878355291e-04 + 24 7.2410992459670032e-04 -7.6012809759399148e-04 4.3327155120505761e-04 + 25 1.3921349891892136e-03 -1.9207002802470530e-03 -5.7453335098663809e-04 + 26 -1.4901465945625111e-03 4.2012923513626559e-04 2.1578545406129137e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nph_small.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nph_small.yaml index 3894815950..80cc4865c0 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nph_small.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nph_small.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:11 2024 epsilon: 6.5e-13 skip_tests: prerequisites: ! | @@ -14,38 +15,38 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |2- - 2.7340318973870396e+01 4.7963870091858283e+00 6.8884396847592512e+01 2.9853310007358935e+01 -1.0857139901347722e+01 -5.1889756561454785e+00 -global_scalar: 9.77678786310451 + 2.7340318979717416e+01 4.7963870104375275e+00 6.8884396847589585e+01 2.9853310005453281e+01 -1.0857139898599751e+01 -5.1889756547311965e+00 +global_scalar: 9.776787862991728 run_pos: ! |2 - 1 -5.1121862036604515e-01 2.8134872171079977e+00 -4.8993015395518924e-02 - 2 1.4735952488047133e-01 3.3535825972277546e+00 -9.3694001270735150e-01 - 3 -9.8023793775378820e-01 1.4277788160410712e+00 -6.3283768722999234e-01 - 4 -1.9793617512974304e+00 1.7069097152779946e+00 -1.4449221382955635e+00 - 5 -1.2073406578712120e+00 1.0799834439081337e+00 6.9555923026692668e-01 - 6 1.3848116183742931e-01 2.8090381873852976e-01 -1.4910727029127884e+00 - 7 1.9062418946016990e-01 1.4366032742456625e-02 -3.0196292835199614e+00 - 8 1.1231015082845541e+00 -5.2094745136401599e-01 -7.0318517336042774e-01 - 9 1.3648756844511976e+00 -2.6143726919534771e-01 5.2247754752734465e-01 - 10 2.0900856844466578e+00 -1.5863783165912952e+00 -1.0801209545800976e+00 - 11 1.8348175253566659e+00 -2.2165258198419622e+00 -2.2686429310672072e+00 - 12 3.2042965133156098e+00 -5.2712831182449804e-01 -1.9248196297790088e+00 - 13 4.3832508188729271e+00 -9.9190674157019298e-01 -1.9502033172902991e+00 - 14 2.7519224412447691e+00 -4.3539271970391624e-01 -3.2687227073821310e+00 - 15 3.1732939937025400e+00 6.6003562890618639e-01 -1.4385076445934288e+00 - 16 2.8067449168447887e+00 -2.6773787170015133e+00 2.1667842294144180e-01 - 17 2.3305479923928516e+00 -2.3464414104884277e+00 1.6639254952584981e+00 - 18 2.2269920241232128e+00 3.4328783208254681e+00 -4.4342132514635013e+00 - 19 1.6145347679280793e+00 3.0386658278179439e+00 -5.1868156516245785e+00 - 20 2.8608613711028656e+00 4.1100452338287408e+00 -4.8694049549907970e+00 - 21 5.3613621396958795e+00 -4.5653056926475841e+00 -4.5681019697305372e+00 - 22 4.8144754754921184e+00 -4.6999404674483083e+00 -5.4362066556130868e+00 - 23 6.2091840278795729e+00 -4.0659479262420684e+00 -4.8393130641864568e+00 - 24 2.1433208912603074e+00 3.5960988832146015e+00 4.2399236066404100e+00 - 25 1.3636453973491918e+00 3.7192408266342980e+00 3.5723762826473990e+00 - 26 2.6593036729945752e+00 4.4718649490241678e+00 4.3032623333405660e+00 - 27 -2.4141791756398536e+00 -4.8879035738852403e+00 2.9097838637418292e+00 - 28 -3.2961505257539048e+00 -4.5101758871984199e+00 2.2261768979308005e+00 - 29 -1.6779316575994301e+00 -4.0348635219024889e+00 3.1144975929056571e+00 + 1 -5.1121862036689958e-01 2.8134872171089729e+00 -4.8993015395755179e-02 + 2 1.4735952487989756e-01 3.3535825972289093e+00 -9.3694001270719340e-01 + 3 -9.8023793775484513e-01 1.4277788160415970e+00 -6.3283768722997102e-01 + 4 -1.9793617512989155e+00 1.7069097152786199e+00 -1.4449221382951762e+00 + 5 -1.2073406578723613e+00 1.0799834439085494e+00 6.9555923026634758e-01 + 6 1.3848116183685821e-01 2.8090381873868608e-01 -1.4910727029123834e+00 + 7 1.9062418945961834e-01 1.4366032742524126e-02 -3.0196292835188681e+00 + 8 1.1231015082843996e+00 -5.2094745136412257e-01 -7.0318517336038155e-01 + 9 1.3648756844511478e+00 -2.6143726919537080e-01 5.2247754752684727e-01 + 10 2.0900856844469189e+00 -1.5863783165917535e+00 -1.0801209545798738e+00 + 11 1.8348175253568222e+00 -2.2165258198426265e+00 -2.2686429310664504e+00 + 12 3.2042965133163452e+00 -5.2712831182460818e-01 -1.9248196297784048e+00 + 13 4.3832508188741741e+00 -9.9190674157045855e-01 -1.9502033172896844e+00 + 14 2.7519224412453145e+00 -4.3539271970399618e-01 -3.2687227073809266e+00 + 15 3.1732939937032665e+00 6.6003562890646350e-01 -1.4385076445930487e+00 + 16 2.8067449168453553e+00 -2.6773787170023233e+00 2.1667842294107942e-01 + 17 2.3305479923932175e+00 -2.3464414104891320e+00 1.6639254952574838e+00 + 18 2.2269920241209178e+00 3.4328783208250382e+00 -4.4342132514621486e+00 + 19 1.6145347679293440e+00 3.0386658278306271e+00 -5.1868156516331227e+00 + 20 2.8608613711127191e+00 4.1100452338250122e+00 -4.8694049549767646e+00 + 21 5.3613621397513214e+00 -4.5653056926761684e+00 -4.5681019697231218e+00 + 22 4.8144754754873968e+00 -4.6999404677006380e+00 -5.4362066555318300e+00 + 23 6.2091840276731247e+00 -4.0659479258840996e+00 -4.8393130642860642e+00 + 24 2.1433208912158790e+00 3.5960988832250020e+00 4.2399236066734023e+00 + 25 1.3636453973794058e+00 3.7192408266942927e+00 3.5723762826011995e+00 + 26 2.6593036731433042e+00 4.4718649489304223e+00 4.3032623332423157e+00 + 27 -2.4141791756415234e+00 -4.8879035738867795e+00 2.9097838637402536e+00 + 28 -3.2961505257559520e+00 -4.5101758871998348e+00 2.2261768979295358e+00 + 29 -1.6779316576007828e+00 -4.0348635219037465e+00 3.1144975929039944e+00 run_vel: ! |2 1 7.7867804888392077e-04 5.8970331623292821e-04 -2.2179517633030531e-04 2 2.7129529964126462e-03 4.6286427111164284e-03 3.5805549693846352e-03 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.1638284997073272e-04 -2.6313163919070405e-04 6.1054395248656961e-04 - 19 7.6494647252307673e-04 -1.3190724749214326e-03 7.9947132612985723e-04 - 20 -6.1620104632513929e-04 4.2577138774295274e-04 3.2526261653548693e-04 - 21 -1.2063428871524097e-03 -2.2879409878999576e-04 8.9132836538734445e-04 - 22 -1.4151473871894464e-03 -1.3502255393198256e-03 1.1972773109437849e-03 - 23 3.1280366109607172e-04 -3.5563936893394407e-03 -4.9548546532774958e-04 - 24 7.5594375541558026e-04 -8.1321043994394464e-04 3.9340911295780739e-04 - 25 1.4373446731689036e-03 -1.9778020567486213e-03 -6.1842201918304478e-04 - 26 -1.4806168650325999e-03 3.7766934274110835e-04 2.1280924225288342e-03 + 18 3.1638284997600319e-04 -2.6313163919763335e-04 6.1054395248685519e-04 + 19 7.6494647250110288e-04 -1.3190724749175438e-03 7.9947132612783736e-04 + 20 -6.1620104632483571e-04 4.2577138776739548e-04 3.2526261653790590e-04 + 21 -1.2063428872614197e-03 -2.2879409923923591e-04 8.9132836537741717e-04 + 22 -1.4151473875545966e-03 -1.3502255384792933e-03 1.1972773112250280e-03 + 23 3.1280366189902534e-04 -3.5563936883846667e-03 -4.9548546556753227e-04 + 24 7.5594375538112746e-04 -8.1321044009394260e-04 3.9340911288157350e-04 + 25 1.4373446730968913e-03 -1.9778020567293151e-03 -6.1842201907436371e-04 + 26 -1.4806168648243687e-03 3.7766934332225264e-04 2.1280924227258073e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_npt.yaml b/unittest/force-styles/tests/fix-timestep-rigid_npt.yaml index f5965e53ff..6757db5703 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_npt.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_npt.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:12 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -13,65 +14,65 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.6326314448663306e+03 -1.4727331978532295e+03 -3.8557370515932275e+03 5.5052891601644615e+02 4.7346742977310657e+02 -6.2035591882122355e+02 -global_scalar: 106.86683072474125 + -1.6326314448662429e+03 -1.4727331978532245e+03 -3.8557370515929042e+03 5.5052891601619581e+02 4.7346742977256520e+02 -6.2035591882208064e+02 +global_scalar: 106.86683072474244 run_pos: ! |2 - 1 -2.6314711410922875e-01 2.4664715027241684e+00 -1.7093568570875561e-01 - 2 3.1632911015968190e-01 2.9434731493852482e+00 -8.5432214735778889e-01 - 3 -6.7623447816593885e-01 1.2410822625695044e+00 -6.1935152269903870e-01 - 4 -1.5552134736906362e+00 1.4878541800991378e+00 -1.2440909745466859e+00 - 5 -8.7601967096402067e-01 9.3417436540572218e-01 4.0272031680440712e-01 - 6 3.0755837780638462e-01 2.2629147986241449e-01 -1.2791162680673960e+00 - 7 3.5322094628053069e-01 -1.0043890952307954e-02 -2.4548503163676365e+00 - 8 1.1736205127907979e+00 -4.8269091330540537e-01 -6.7273784266507608e-01 - 9 1.3865071239751696e+00 -2.5278331076620741e-01 2.6996653369766221e-01 - 10 2.0239883243193546e+00 -1.4252201368162511e+00 -9.6228264545891751e-01 - 11 1.7991233925769246e+00 -1.9828365722517098e+00 -1.8762366544355809e+00 - 12 3.0044710092992837e+00 -4.8928363303895761e-01 -1.6126944183951402e+00 - 13 4.0415308387392486e+00 -9.0061411581930262e-01 -1.6321139880363660e+00 - 14 2.6064005411338655e+00 -4.0859653026870735e-01 -2.6465043951812621e+00 - 15 2.9775904824773907e+00 5.6065407887877150e-01 -1.2391617757503752e+00 - 16 2.6542663248057963e+00 -2.3895844048756363e+00 3.5746598094128501e-02 - 17 2.2355490747046538e+00 -2.0962135127180099e+00 1.1489434027780590e+00 - 18 2.0921160979727347e+00 2.9872159674143273e+00 -3.4902339097026891e+00 - 19 1.4908686219092431e+00 2.6025398330908249e+00 -4.2194623779121834e+00 - 20 2.7154518806645740e+00 3.6506388357595867e+00 -3.9111287168645399e+00 - 21 4.8435638296030810e+00 -4.0881941921728835e+00 -3.5957796498833634e+00 - 22 4.3080557005367073e+00 -4.2177797604324549e+00 -4.4370935526124242e+00 - 23 5.6713237924930837e+00 -3.5912865024293716e+00 -3.8555915013182531e+00 - 24 2.0228224543345528e+00 3.1208125399081723e+00 3.1634860992076259e+00 - 25 1.2576132296055036e+00 3.2447174749294536e+00 2.5191319958251963e+00 - 26 2.5334951322488237e+00 3.9783477827941720e+00 3.2212409164234312e+00 - 27 -1.8488304998563332e+00 -4.2601261704683342e+00 2.0568476369354265e+00 - 28 -2.6026086128772454e+00 -3.9329047688996304e+00 1.5399898445636406e+00 - 29 -1.2195954744860957e+00 -3.5211468177700818e+00 2.2116264666073615e+00 + 1 -2.6314711410917102e-01 2.4664715027243860e+00 -1.7093568570953632e-01 + 2 3.1632911015962950e-01 2.9434731493852171e+00 -8.5432214735883338e-01 + 3 -6.7623447816593352e-01 1.2410822625695497e+00 -6.1935152269929450e-01 + 4 -1.5552134736907304e+00 1.4878541800989344e+00 -1.2440909745469027e+00 + 5 -8.7601967096385724e-01 9.3417436540614585e-01 4.0272031680429610e-01 + 6 3.0755837780630380e-01 2.2629147986222176e-01 -1.2791162680674191e+00 + 7 3.5322094628027934e-01 -1.0043890952942114e-02 -2.4548503163675806e+00 + 8 1.1736205127908210e+00 -4.8269091330536096e-01 -6.7273784266496328e-01 + 9 1.3865071239753313e+00 -2.5278331076580596e-01 2.6996653369765600e-01 + 10 2.0239883243193466e+00 -1.4252201368163044e+00 -9.6228264545858089e-01 + 11 1.7991233925767878e+00 -1.9828365722521095e+00 -1.8762366544350000e+00 + 12 3.0044710092991682e+00 -4.8928363303924272e-01 -1.6126944183953009e+00 + 13 4.0415308387391402e+00 -9.0061411581958151e-01 -1.6321139880365303e+00 + 14 2.6064005411335902e+00 -4.0859653026938592e-01 -2.6465043951813936e+00 + 15 2.9775904824773161e+00 5.6065407887862850e-01 -1.2391617757509259e+00 + 16 2.6542663248059526e+00 -2.3895844048753085e+00 3.5746598094734239e-02 + 17 2.2355490747049700e+00 -2.0962135127172692e+00 1.1489434027786212e+00 + 18 2.0921160979710356e+00 2.9872159674136229e+00 -3.4902339097027140e+00 + 19 1.4908686219074729e+00 2.6025398330897387e+00 -4.2194623779119471e+00 + 20 2.7154518806624317e+00 3.6506388357591026e+00 -3.9111287168648765e+00 + 21 4.8435638296045518e+00 -4.0881941921723524e+00 -3.5957796498832693e+00 + 22 4.3080557005379525e+00 -4.2177797604322951e+00 -4.4370935526121276e+00 + 23 5.6713237924942437e+00 -3.5912865024285043e+00 -3.8555915013185178e+00 + 24 2.0228224543350635e+00 3.1208125399084361e+00 3.1634860992076055e+00 + 25 1.2576132296057372e+00 3.2447174749292715e+00 2.5191319958254175e+00 + 26 2.5334951322489658e+00 3.9783477827946756e+00 3.2212409164231035e+00 + 27 -1.8488304998563332e+00 -4.2601261704683413e+00 2.0568476369354238e+00 + 28 -2.6026086128772454e+00 -3.9329047688996370e+00 1.5399898445636415e+00 + 29 -1.2195954744860957e+00 -3.5211468177700862e+00 2.2116264666073588e+00 run_vel: ! |2 - 1 1.2393084479630034e-03 7.0215195817155049e-04 -1.1910956210640397e-03 - 2 1.3060936199988536e-03 2.5041119719347224e-04 -1.4496302699051125e-03 - 3 8.7069732478159932e-04 6.1866591813748923e-04 -6.2317312592554579e-04 - 4 8.8100215742025064e-04 5.8380213791516000e-04 -6.5145037264846529e-04 - 5 8.7979303397991678e-04 1.1152950208762130e-03 -4.7231382224758212e-04 - 6 5.3965146863311727e-04 6.8643008418757634e-05 -2.7149223435848658e-04 - 7 3.7117679682181569e-04 -4.5322194777211656e-04 -1.7317402888851005e-04 - 8 4.0378854177636284e-04 9.9015358993666757e-05 -4.1783685861269460e-05 - 9 5.4970639315540500e-04 4.5048022318729304e-04 -1.6045108899919851e-04 - 10 1.2521448037945991e-04 -2.5472783650533836e-04 2.9052485920877619e-04 - 11 -1.0599027352488127e-04 -5.9051612835384309e-04 5.5226010155799178e-04 - 12 3.1798607399623040e-04 -7.9980833669012115e-04 -2.0274707260294341e-04 - 13 2.0597404142686670e-04 -1.0865778699535151e-03 -1.1731137935658918e-04 - 14 2.4719215573349161e-04 -1.1410575874168858e-03 -2.0209037936298231e-04 - 15 6.3286464043726845e-04 -6.3068988069288313e-04 -6.5527927471360488e-04 - 16 -4.4100406048953834e-05 8.6869240444187047e-06 6.5198761255923199e-04 - 17 1.3407421346950653e-04 6.0357565278263911e-04 5.6233596575975121e-04 - 18 7.9277804690569076e-04 -1.5618239874425175e-03 2.1367192719678593e-03 - 19 5.6167660797942776e-04 -1.2371794194922848e-03 2.1562222137424714e-03 - 20 1.1137406410123489e-03 -1.8729421751430327e-03 2.1222207985340819e-03 - 21 -2.8426953558137740e-03 -2.9730185469781381e-03 1.8564402246257748e-03 - 22 -2.9480844379790165e-03 -2.6797216173769360e-03 1.8784164631754769e-03 - 23 -2.5997293519674958e-03 -3.3926375081633348e-03 1.8288830284141459e-03 - 24 1.1689404599043950e-03 -1.6701257754515662e-03 2.1428138286394673e-03 - 25 1.2027302640333160e-03 -1.2630861421196525e-03 2.1808987508670514e-03 - 26 1.6116362268906780e-03 -1.9337182438138849e-03 2.1377249582867843e-03 + 1 1.2393084479632162e-03 7.0215195817134601e-04 -1.1910956210642444e-03 + 2 1.3060936199989690e-03 2.5041119719309234e-04 -1.4496302699052684e-03 + 3 8.7069732478170037e-04 6.1866591813752230e-04 -6.2317312592555772e-04 + 4 8.8100215742026918e-04 5.8380213791525335e-04 -6.5145037264832683e-04 + 5 8.7979303398017070e-04 1.1152950208763543e-03 -4.7231382224773813e-04 + 6 5.3965146863306555e-04 6.8643008418797912e-05 -2.7149223435837187e-04 + 7 3.7117679682156736e-04 -4.5322194777208414e-04 -1.7317402888817444e-04 + 8 4.0378854177637320e-04 9.9015358993721983e-05 -4.1783685861266425e-05 + 9 5.4970639315557207e-04 4.5048022318731326e-04 -1.6045108899939207e-04 + 10 1.2521448037938158e-04 -2.5472783650525840e-04 2.9052485920884211e-04 + 11 -1.0599027352512348e-04 -5.9051612835367331e-04 5.5226010155827335e-04 + 12 3.1798607399607243e-04 -7.9980833669034384e-04 -2.0274707260289267e-04 + 13 2.0597404142668038e-04 -1.0865778699538143e-03 -1.1731137935657286e-04 + 14 2.4719215573317579e-04 -1.1410575874171004e-03 -2.0209037936272953e-04 + 15 6.3286464043720871e-04 -6.3068988069325653e-04 -6.5527927471369335e-04 + 16 -4.4100406048914694e-05 8.6869240445997393e-06 6.5198761255915100e-04 + 17 1.3407421346973834e-04 6.0357565278286712e-04 5.6233596575947994e-04 + 18 7.9277804690533363e-04 -1.5618239874416928e-03 2.1367192719678658e-03 + 19 5.6167660797890148e-04 -1.2371794194914493e-03 2.1562222137424727e-03 + 20 1.1137406410120911e-03 -1.8729421751419769e-03 2.1222207985341088e-03 + 21 -2.8426953558134235e-03 -2.9730185469789214e-03 1.8564402246258563e-03 + 22 -2.9480844379788334e-03 -2.6797216173776307e-03 1.8784164631755556e-03 + 23 -2.5997293519669897e-03 -3.3926375081639489e-03 1.8288830284142509e-03 + 24 1.1689404599044329e-03 -1.6701257754517325e-03 2.1428138286393884e-03 + 25 1.2027302640331447e-03 -1.2630861421197028e-03 2.1808987508669616e-03 + 26 1.6116362268908176e-03 -1.9337182438138503e-03 2.1377249582867175e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_npt_small.yaml b/unittest/force-styles/tests/fix-timestep-rigid_npt_small.yaml index 3b13658e19..b7b074debd 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_npt_small.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_npt_small.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:12 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -13,38 +14,38 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -8.7531774769722489e+01 5.5811525017966304e+00 -5.5468297744356747e+01 -1.5316306343483370e+02 1.4641268097314367e+02 1.7263710089631324e+01 -global_scalar: 77.78983430293286 + -8.7531774640811420e+01 5.5811525750281774e+00 -5.5468297761715007e+01 -1.5316306336726905e+02 1.4641268095299071e+02 1.7263710083146290e+01 +global_scalar: 77.78983430422252 run_pos: ! |2 - 1 -4.6333219629007161e-01 2.7511450055070625e+00 -1.2865946102806269e-01 - 2 1.7937148390204793e-01 3.2800405238539234e+00 -8.8510337855738808e-01 - 3 -9.2104620265671233e-01 1.3941717929286011e+00 -6.2603796687145774e-01 - 4 -1.8960869879711328e+00 1.6675144043869761e+00 -1.3178544214440926e+00 - 5 -1.1426748052069362e+00 1.0535885915279550e+00 5.0562616550054784e-01 - 6 1.7070712623541162e-01 2.7107933832755826e-01 -1.3571701846607374e+00 - 7 2.2159329060539701e-01 1.0068698962042433e-02 -2.6593507556860114e+00 - 8 1.1315940381701060e+00 -5.1414408469809381e-01 -6.8596713849763802e-01 - 9 1.3675404538221994e+00 -2.6001531899016506e-01 3.5817751536664133e-01 - 10 2.0752698846777218e+00 -1.5574812996955254e+00 -1.0070795245589492e+00 - 11 1.8261547470632067e+00 -2.1745615463231482e+00 -2.0195839000288469e+00 - 12 3.1626236108721066e+00 -5.2019677375525752e-01 -1.7266801053747978e+00 - 13 4.3131602274134853e+00 -9.7533717592326674e-01 -1.7483045222380902e+00 - 14 2.7211536303664605e+00 -4.3036348628163701e-01 -2.8715539682060491e+00 - 15 3.1323683805788374e+00 6.4234915962457073e-01 -1.3123899007466848e+00 - 16 2.7746546569032322e+00 -2.6258578189755974e+00 9.7666596945726880e-02 - 17 2.3099360535750506e+00 -2.3017831004883886e+00 1.3305794265747686e+00 - 18 2.2091748314094701e+00 3.3564440703097080e+00 -3.8370878208998480e+00 - 19 1.5986312961639815e+00 2.9614993054417287e+00 -4.5778944294436021e+00 - 20 2.8405364052167421e+00 4.0335971973474170e+00 -4.2659151034329339e+00 - 21 5.2651527410670678e+00 -4.4761614286515128e+00 -3.9518304737634447e+00 - 22 4.7192922284117014e+00 -4.6119045765637390e+00 -4.8062296930647124e+00 - 23 6.1127575782518644e+00 -3.9811721108739997e+00 -4.2204729624242692e+00 - 24 2.1290800761933255e+00 3.5132841007593623e+00 3.5392070209389175e+00 - 25 1.3519459804490630e+00 3.6349473854278020e+00 2.8807586653452137e+00 - 26 2.6413474233716503e+00 4.3893648735951771e+00 3.6035699967293215e+00 - 27 -2.3204235087828389e+00 -4.7905434153250859e+00 2.3919287951691697e+00 - 28 -3.1811356909797261e+00 -4.4206486004501846e+00 1.8095625809312565e+00 - 29 -1.6019226098503827e+00 -3.9551927030786480e+00 2.5663248522869146e+00 + 1 -4.6333219629057343e-01 2.7511450055078264e+00 -1.2865946102794723e-01 + 2 1.7937148390171043e-01 3.2800405238548382e+00 -8.8510337855745913e-01 + 3 -9.2104620265733672e-01 1.3941717929290096e+00 -6.2603796687147195e-01 + 4 -1.8960869879720148e+00 1.6675144043874610e+00 -1.3178544214442827e+00 + 5 -1.1426748052076219e+00 1.0535885915282748e+00 5.0562616550083384e-01 + 6 1.7070712623507234e-01 2.7107933832768616e-01 -1.3571701846609390e+00 + 7 2.2159329060507194e-01 1.0068698962099276e-02 -2.6593507556865532e+00 + 8 1.1315940381700180e+00 -5.1414408469817374e-01 -6.8596713849766644e-01 + 9 1.3675404538221745e+00 -2.6001531899018637e-01 3.5817751536688647e-01 + 10 2.0752698846778816e+00 -1.5574812996958780e+00 -1.0070795245590576e+00 + 11 1.8261547470632973e+00 -2.1745615463236652e+00 -2.0195839000292208e+00 + 12 3.1626236108725436e+00 -5.2019677375534190e-01 -1.7266801053750953e+00 + 13 4.3131602274142278e+00 -9.7533717592347013e-01 -1.7483045222383922e+00 + 14 2.7211536303667962e+00 -4.3036348628169740e-01 -2.8715539682066451e+00 + 15 3.1323683805792637e+00 6.4234915962478567e-01 -1.3123899007468758e+00 + 16 2.7746546569035768e+00 -2.6258578189762343e+00 9.7666596945902739e-02 + 17 2.3099360535752709e+00 -2.3017831004889393e+00 1.3305794265752642e+00 + 18 2.2091748313982826e+00 3.3564440703034535e+00 -3.8370878209026742e+00 + 19 1.5986312961681257e+00 2.9614993054929961e+00 -4.5778944294898185e+00 + 20 2.8405364052584243e+00 4.0335971973267473e+00 -4.2659151033808254e+00 + 21 5.2651527413064194e+00 -4.4761614287784965e+00 -3.9518304737405883e+00 + 22 4.7192922283808425e+00 -4.6119045776644754e+00 -4.8062296927145907e+00 + 23 6.1127575773332410e+00 -3.9811721092729444e+00 -4.2204729628712050e+00 + 24 2.1290800759971340e+00 3.5132841007987228e+00 3.5392070210919400e+00 + 25 1.3519459805793055e+00 3.6349473856926782e+00 2.8807586651545414e+00 + 26 2.6413474240255201e+00 4.3893648731792023e+00 3.6035699963154144e+00 + 27 -2.3204235087838274e+00 -4.7905434153262867e+00 2.3919287951699459e+00 + 28 -3.1811356909809412e+00 -4.4206486004512886e+00 1.8095625809318783e+00 + 29 -1.6019226098511883e+00 -3.9551927030796277e+00 2.5663248522877335e+00 run_vel: ! |2 1 7.7867804888392077e-04 5.8970331623292821e-04 -2.2179517633030531e-04 2 2.7129529964126462e-03 4.6286427111164284e-03 3.5805549693846352e-03 @@ -63,15 +64,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 7.2384391131466940e-04 -6.0015829212802722e-04 1.5957533238990559e-03 - 19 1.7583138222551384e-03 -3.0158245948490804e-03 2.0310435058142470e-03 - 20 -1.4153552732353322e-03 9.7835305930749246e-04 9.3881222516217474e-04 - 21 -2.7591188772323472e-03 -5.1180650802276303e-04 2.2758295071994400e-03 - 22 -3.2319732401280494e-03 -3.0809796427949646e-03 2.9861065768383484e-03 - 23 6.9767443123301817e-04 -8.1543313142268207e-03 -8.9929522742256325e-04 - 24 1.7345816999787505e-03 -1.8508160062822962e-03 1.0723416147087287e-03 - 25 3.2855417755407162e-03 -4.5284294762327620e-03 -1.2529299007822618e-03 - 26 -3.4004728795728936e-03 8.5952140737749613e-04 5.0505027847540665e-03 + 18 7.2384391137821785e-04 -6.0015829219183913e-04 1.5957533239005792e-03 + 19 1.7583138220942001e-03 -3.0158245949231362e-03 2.0310435058145879e-03 + 20 -1.4153552733289841e-03 9.7835305963750062e-04 9.3881222515317965e-04 + 21 -2.7591188784018856e-03 -5.1180651254767841e-04 2.2758295071625967e-03 + 22 -3.2319732438308327e-03 -3.0809796341686479e-03 2.9861065796802132e-03 + 23 6.9767443960831559e-04 -8.1543313047864312e-03 -8.9929523012053270e-04 + 24 1.7345816996818938e-03 -1.8508160077951139e-03 1.0723416139084840e-03 + 25 3.2855417748809557e-03 -4.5284294761711655e-03 -1.2529298997977286e-03 + 26 -3.4004728777299181e-03 8.5952141335802687e-04 5.0505027869618231e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nve_group.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nve_group.yaml index b20d639fd4..0e8f18da7b 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nve_group.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nve_group.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:13 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,65 +15,65 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.4245356938011610e+03 -1.4496493316650424e+03 -3.6144360982532016e+03 8.4840626794792252e+02 2.0318336802442886e+02 -6.0622397695978805e+02 -global_scalar: 15.711521423178128 + -1.4245356938011316e+03 -1.4496493316650758e+03 -3.6144360982530934e+03 8.4840626794790273e+02 2.0318336802433893e+02 -6.0622397695991208e+02 +global_scalar: 15.711521423178162 run_pos: ! |2 - 1 -2.7899546863905123e-01 2.4731857340327181e+00 -1.7290667740231969e-01 - 2 3.0296221610252227e-01 2.9517129916957194e+00 -8.5798904387756503e-01 - 3 -6.9368802364141247e-01 1.2445115421753310e+00 -6.2281111198650718e-01 - 4 -1.5764879647103560e+00 1.4919714415840475e+00 -1.2492069414674947e+00 - 5 -8.9434512967440649e-01 9.3651699743494377e-01 4.0191726558257690e-01 - 6 2.9454439634452678e-01 2.2724545792543693e-01 -1.2845195053960459e+00 - 7 3.4049112903278234e-01 -9.4655678321664549e-03 -2.4634480020857370e+00 - 8 1.1644354555804921e+00 -4.8367776650962330e-01 -6.7663643940738027e-01 - 9 1.3781717822695918e+00 -2.5332509530017322e-01 2.6864954436590494e-01 - 10 2.0186368606042460e+00 -1.4285861423625348e+00 -9.6712491252784183e-01 - 11 1.7929137227578726e+00 -1.9875455388406436e+00 -1.8836565352267429e+00 - 12 3.0032775230400142e+00 -4.8983022415161337e-01 -1.6190248017342870e+00 - 13 4.0448964162126639e+00 -9.0213155122374034e-01 -1.6385398399478515e+00 - 14 2.6035151245016883e+00 -4.0874995493201027e-01 -2.6555999074785985e+00 - 15 2.9761196776172243e+00 5.6287237454118566e-01 -1.2442626196081918e+00 - 16 2.6517373021566577e+00 -2.3957035508393689e+00 3.3389262100618433e-02 - 17 2.2311114924744668e+00 -2.1018393228799419e+00 1.1496088522376777e+00 - 18 2.1390642573199212e+00 3.0164773560692755e+00 -3.5143984803853900e+00 - 19 1.5353246655143720e+00 2.6305911186314508e+00 -4.2455871034736816e+00 - 20 2.7649421538935122e+00 3.6818603528430254e+00 -3.9364115785985936e+00 - 21 4.9043112657301942e+00 -4.0774268210396798e+00 -3.6200836396129796e+00 - 22 4.3665322424286144e+00 -4.2075138112953070e+00 -4.4636587264885614e+00 - 23 5.7355405581987764e+00 -3.5789558641907195e+00 -3.8805763324090350e+00 - 24 2.0692780332810026e+00 3.1504920436416008e+00 3.1571131300668833e+00 - 25 1.3007297593168636e+00 3.2745259354178766e+00 2.5110163874103986e+00 - 26 2.5819416446099002e+00 4.0104903120757012e+00 3.2150249624525742e+00 + 1 -2.7899546863904412e-01 2.4731857340327541e+00 -1.7290667740243348e-01 + 2 3.0296221610251317e-01 2.9517129916957181e+00 -8.5798904387771990e-01 + 3 -6.9368802364141358e-01 1.2445115421753392e+00 -6.2281111198654315e-01 + 4 -1.5764879647103740e+00 1.4919714415840188e+00 -1.2492069414675249e+00 + 5 -8.9434512967438362e-01 9.3651699743500849e-01 4.0191726558256402e-01 + 6 2.9454439634451368e-01 2.2724545792540876e-01 -1.2845195053960490e+00 + 7 3.4049112903274215e-01 -9.4655678322607961e-03 -2.4634480020857299e+00 + 8 1.1644354555804954e+00 -4.8367776650961680e-01 -6.7663643940736340e-01 + 9 1.3781717822696169e+00 -2.5332509530011327e-01 2.6864954436590560e-01 + 10 2.0186368606042455e+00 -1.4285861423625437e+00 -9.6712491252779242e-01 + 11 1.7929137227578522e+00 -1.9875455388407057e+00 -1.8836565352266585e+00 + 12 3.0032775230399977e+00 -4.8983022415165589e-01 -1.6190248017343138e+00 + 13 4.0448964162126479e+00 -9.0213155122378219e-01 -1.6385398399478794e+00 + 14 2.6035151245016470e+00 -4.0874995493211108e-01 -2.6555999074786221e+00 + 15 2.9761196776172136e+00 5.6287237454116579e-01 -1.2442626196082760e+00 + 16 2.6517373021566839e+00 -2.3957035508393223e+00 3.3389262100708361e-02 + 17 2.2311114924745179e+00 -2.1018393228798340e+00 1.1496088522377621e+00 + 18 2.1390642573196605e+00 3.0164773560691671e+00 -3.5143984803853927e+00 + 19 1.5353246655140995e+00 2.6305911186312847e+00 -4.2455871034736425e+00 + 20 2.7649421538931831e+00 3.6818603528429503e+00 -3.9364115785986438e+00 + 21 4.9043112657304171e+00 -4.0774268210395990e+00 -3.6200836396129659e+00 + 22 4.3665322424288018e+00 -4.2075138112952830e+00 -4.4636587264885161e+00 + 23 5.7355405581989505e+00 -3.5789558641905872e+00 -3.8805763324090754e+00 + 24 2.0692780332810834e+00 3.1504920436416377e+00 3.1571131300668784e+00 + 25 1.3007297593169014e+00 3.2745259354178451e+00 2.5110163874104305e+00 + 26 2.5819416446099250e+00 4.0104903120757749e+00 3.2150249624525231e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 run_vel: ! |2 - 1 4.7093289825841293e-04 2.6351122778450888e-04 -4.4905093064113717e-04 - 2 4.9594625316469964e-04 9.4561370489668111e-05 -5.4581359894048111e-04 - 3 3.3306085115754910e-04 2.3224943880673595e-04 -2.3659455671744723e-04 - 4 3.3692327392259764e-04 2.1926810694050300e-04 -2.4716631558861373e-04 - 5 3.3642542694184180e-04 4.1797578013266372e-04 -1.8011341766654800e-04 - 6 2.0926869754934492e-04 2.6449308951570887e-05 -1.0508938983871866e-04 - 7 1.4629043007908284e-04 -1.6873376665352296e-04 -6.8354048774366290e-05 - 8 1.5844101624224813e-04 3.7728761273997381e-05 -1.9162715667088780e-05 - 9 2.1299362072601532e-04 1.6917140529158732e-04 -6.3528165037833598e-05 - 10 5.4261629412260376e-05 -9.4655528376821362e-05 1.0511362869146115e-04 - 11 -3.2194160796493454e-05 -2.2025095264761673e-04 2.0300202946211041e-04 - 12 1.2640586304751833e-04 -2.9851080445664229e-04 -7.9476371818270762e-05 - 13 8.4523575162163329e-05 -4.0583135407329152e-04 -4.7551111331733064e-05 - 14 9.9954050381288400e-05 -4.2610816481298728e-04 -7.9255633594414740e-05 - 15 2.4417481119791087e-04 -2.3521002264675206e-04 -2.4875318161051227e-04 - 16 -9.0958138549618100e-06 3.7774817121146141e-06 2.4035199548835590e-04 - 17 5.7507224523608950e-05 2.2629217444844056e-04 2.0686920072686990e-04 - 18 2.9220264989358538e-04 -6.2478376436791018e-04 8.4222594596602778e-04 - 19 2.0572616567796829e-04 -5.0334424271721273e-04 8.4953929443210897e-04 - 20 4.1224811789512659e-04 -7.4115205416005016e-04 8.3678612337508636e-04 - 21 -1.0671858777656236e-03 -1.1531171045500116e-03 7.3720674900161585e-04 - 22 -1.1066511338291651e-03 -1.0433933757601002e-03 7.4544544325707912e-04 - 23 -9.7629260480938717e-04 -1.3100872491594619e-03 7.2687284219704522e-04 - 24 4.3308126651259090e-04 -6.6527658087322823e-04 8.4451298670663681e-04 - 25 4.4565811905441464e-04 -5.1298436273583472e-04 8.5878867884521526e-04 - 26 5.9865972692023459e-04 -7.6385263287079232e-04 8.4259943226842524e-04 + 1 4.7093289825842481e-04 2.6351122778449815e-04 -4.4905093064115029e-04 + 2 4.9594625316470614e-04 9.4561370489646928e-05 -5.4581359894049163e-04 + 3 3.3306085115755453e-04 2.3224943880673822e-04 -2.3659455671744877e-04 + 4 3.3692327392259862e-04 2.1926810694050856e-04 -2.4716631558860722e-04 + 5 3.3642542694185568e-04 4.1797578013267277e-04 -1.8011341766655748e-04 + 6 2.0926869754934175e-04 2.6449308951572771e-05 -1.0508938983871239e-04 + 7 1.4629043007906883e-04 -1.6873376665352220e-04 -6.8354048774347479e-05 + 8 1.5844101624224818e-04 3.7728761274000153e-05 -1.9162715667088122e-05 + 9 2.1299362072602399e-04 1.6917140529158875e-04 -6.3528165037844006e-05 + 10 5.4261629412255362e-05 -9.4655528376817648e-05 1.0511362869146607e-04 + 11 -3.2194160796507657e-05 -2.2025095264760857e-04 2.0300202946212778e-04 + 12 1.2640586304750909e-04 -2.9851080445665606e-04 -7.9476371818267184e-05 + 13 8.4523575162152420e-05 -4.0583135407330979e-04 -4.7551111331730963e-05 + 14 9.9954050381270538e-05 -4.2610816481300132e-04 -7.9255633594400035e-05 + 15 2.4417481119790729e-04 -2.3521002264677391e-04 -2.4875318161051720e-04 + 16 -9.0958138549606716e-06 3.7774817121242263e-06 2.4035199548835337e-04 + 17 5.7507224523620660e-05 2.2629217444845357e-04 2.0686920072685659e-04 + 18 2.9220264989356375e-04 -6.2478376436786377e-04 8.4222594596602756e-04 + 19 2.0572616567793704e-04 -5.0334424271716611e-04 8.4953929443210886e-04 + 20 4.1224811789511017e-04 -7.4115205415999053e-04 8.3678612337508690e-04 + 21 -1.0671858777656028e-03 -1.1531171045500558e-03 7.3720674900162051e-04 + 22 -1.1066511338291541e-03 -1.0433933757601397e-03 7.4544544325708389e-04 + 23 -9.7629260480935768e-04 -1.3100872491594961e-03 7.2687284219705075e-04 + 24 4.3308126651259366e-04 -6.6527658087323755e-04 8.4451298670663172e-04 + 25 4.4565811905440515e-04 -5.1298436273583775e-04 8.5878867884520984e-04 + 26 5.9865972692024294e-04 -7.6385263287079004e-04 8.4259943226842036e-04 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nve_molecule.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nve_molecule.yaml index 839ac060a5..b0e52d3fea 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nve_molecule.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nve_molecule.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:13 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,8 +15,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -4.9200114774918006e+01 -2.6907707694141354e+01 -6.0080872444875970e+00 -2.5620425756344780e+01 -1.3450222538011893e+01 -1.4947348732785031e+00 -global_scalar: 18.340560167364448 + -4.9200114760030708e+01 -2.6907707699312748e+01 -6.0080872440179061e+00 -2.5620425767600064e+01 -1.3450222535184853e+01 -1.4947348700253382e+00 +global_scalar: 18.340560165889197 run_pos: ! |2 1 -2.7993683669226832e-01 2.4726588069312840e+00 -1.7200860244148433e-01 2 3.0197083955402204e-01 2.9515239068888608e+00 -8.5689735572907566e-01 @@ -34,15 +35,15 @@ run_pos: ! |2 15 2.9756315249791303e+00 5.6334269722969288e-01 -1.2437650754599008e+00 16 2.6517554244980306e+00 -2.3957110424978438e+00 3.2908335999178327e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 - 18 2.1392027588270928e+00 3.0171068018423082e+00 -3.5144628518853867e+00 - 19 1.5366124996934336e+00 2.6286809834236959e+00 -4.2452547844313493e+00 - 20 2.7628161763597592e+00 3.6842251687468450e+00 -3.9370881219419189e+00 - 21 4.9036621348471368e+00 -4.0757648444604762e+00 -3.6192617654906609e+00 - 22 4.3655322292129357e+00 -4.2084949964269480e+00 -4.4622011117992786e+00 - 23 5.7380414790507261e+00 -3.5841969189265162e+00 -3.8827839828320116e+00 - 24 2.0701314764933532e+00 3.1499370533556008e+00 3.1565324853054118e+00 - 25 1.3030170721038390e+00 3.2711173927738786e+00 2.5081940917867680e+00 - 26 2.5776230784374867e+00 4.0127347067334345e+00 3.2182355136150917e+00 + 18 2.1392027588241729e+00 3.0171068018404634e+00 -3.5144628518858858e+00 + 19 1.5366124996944652e+00 2.6286809834366300e+00 -4.2452547844429631e+00 + 20 2.7628161763703827e+00 3.6842251687412753e+00 -3.9370881219283147e+00 + 21 4.9036621349084646e+00 -4.0757648444931904e+00 -3.6192617654848509e+00 + 22 4.3655322292057255e+00 -4.2084949967079632e+00 -4.4622011117106153e+00 + 23 5.7380414788131207e+00 -3.5841969185149058e+00 -3.8827839829438688e+00 + 24 2.0701314764430685e+00 3.1499370533656190e+00 3.1565324853444698e+00 + 25 1.3030170721374645e+00 3.2711173928413317e+00 2.5081940917372791e+00 + 26 2.5776230786045939e+00 4.0127347066259897e+00 3.2182355135086644e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.6149625094898067e-04 -3.1032459262177040e-04 8.1043030117471950e-04 - 19 8.5103884664914254e-04 -1.4572280597118469e-03 1.0163621287571445e-03 - 20 -6.5204659274939057e-04 4.3989037444674739e-04 4.9909839028631532e-04 - 21 -1.3888125888095134e-03 -3.1978049191290817e-04 1.1455681505629727e-03 - 22 -1.6084223473476296e-03 -1.5355394235202363e-03 1.4772010819351844e-03 - 23 2.6392672583440717e-04 -3.9375414417551127e-03 -3.6991583302200246e-04 - 24 8.6062827046548790e-04 -9.4179873487668705e-04 5.5396395555797203e-04 - 25 1.5933645478462865e-03 -2.2139156628290975e-03 -5.5078029723780941e-04 - 26 -1.5679561736454237e-03 3.5146224433513641e-04 2.4446924193838983e-03 + 18 3.6149625095571725e-04 -3.1032459263052550e-04 8.1043030117473349e-04 + 19 8.5103884662188244e-04 -1.4572280597071525e-03 1.0163621287543638e-03 + 20 -6.5204659274901945e-04 4.3989037447700791e-04 4.9909839028904252e-04 + 21 -1.3888125889514069e-03 -3.1978049248194420e-04 1.1455681505565557e-03 + 22 -1.6084223477996385e-03 -1.5355394224557757e-03 1.4772010822781041e-03 + 23 2.6392672685288674e-04 -3.9375414405480738e-03 -3.6991583333937880e-04 + 24 8.6062827042478370e-04 -9.4179873506334828e-04 5.5396395546095014e-04 + 25 1.5933645477524167e-03 -2.2139156628045932e-03 -5.5078029709943691e-04 + 26 -1.5679561733890424e-03 3.5146224505578228e-04 2.4446924196328459e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nve_single.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nve_single.yaml index 854e8b4d45..0277d7991a 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nve_single.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nve_single.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:13 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,26 +15,26 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.3754817467882767e+03 -1.4228425246441275e+03 -3.6087196200592489e+03 8.7407043142559303e+02 2.1665316510426268e+02 -6.0480791467747542e+02 -global_scalar: 4.531423038570333 + -1.3754817467882813e+03 -1.4228425246442055e+03 -3.6087196200592184e+03 8.7407043142559792e+02 2.1665316510417179e+02 -6.0480791467761571e+02 +global_scalar: 4.531423038570381 run_pos: ! |2 - 1 -2.7899546859706881e-01 2.4731857340427750e+00 -1.7290667720866193e-01 - 2 3.0296221616781649e-01 2.9517129917211218e+00 -8.5798904365338713e-01 - 3 -6.9368802362172777e-01 1.2445115422148878e+00 -6.2281111185285920e-01 - 4 -1.5764879646739900e+00 1.4919714416721197e+00 -1.2492069413381908e+00 - 5 -8.9434512967965252e-01 9.3651699743522254e-01 4.0191726569953845e-01 - 6 2.9454439635066831e-01 2.2724545796942719e-01 -1.2845195052894431e+00 - 7 3.4049112905319934e-01 -9.4655677384814507e-03 -2.4634480019885556e+00 - 8 1.1644354555589707e+00 -4.8367776651303718e-01 -6.7663643931662931e-01 - 9 1.3781717822376129e+00 -2.5332509534954067e-01 2.6864954447021949e-01 - 10 2.0186368605646337e+00 -1.4285861423742481e+00 -9.6712491246329535e-01 - 11 1.7929137227202196e+00 -1.9875455388073511e+00 -1.8836565351901273e+00 - 12 3.0032775230343667e+00 -4.8983022415922672e-01 -1.6190248016125368e+00 - 13 4.0448964161972993e+00 -9.0213155125590028e-01 -1.6385398398261621e+00 - 14 2.6035151245156412e+00 -4.0874995488520105e-01 -2.6555999073601511e+00 - 15 2.9761196776308623e+00 5.6287237451808192e-01 -1.2442626194415292e+00 - 16 2.6517373020764632e+00 -2.3957035509096389e+00 3.3389262134244646e-02 - 17 2.2311114923824555e+00 -2.1018393229880719e+00 1.1496088522768189e+00 + 1 -2.7899546859705771e-01 2.4731857340428069e+00 -1.7290667720877784e-01 + 2 3.0296221616781072e-01 2.9517129917211151e+00 -8.5798904365354312e-01 + 3 -6.9368802362172532e-01 1.2445115422148945e+00 -6.2281111185289584e-01 + 4 -1.5764879646740031e+00 1.4919714416720897e+00 -1.2492069413382207e+00 + 5 -8.9434512967962521e-01 9.3651699743528616e-01 4.0191726569952280e-01 + 6 2.9454439635065666e-01 2.2724545796939852e-01 -1.2845195052894454e+00 + 7 3.4049112905316026e-01 -9.4655677385761805e-03 -2.4634480019885459e+00 + 8 1.1644354555589742e+00 -4.8367776651303018e-01 -6.7663643931661244e-01 + 9 1.3781717822376380e+00 -2.5332509534948033e-01 2.6864954447021760e-01 + 10 2.0186368605646310e+00 -1.4285861423742554e+00 -9.6712491246324517e-01 + 11 1.7929137227201968e+00 -1.9875455388074099e+00 -1.8836565351900401e+00 + 12 3.0032775230343471e+00 -4.8983022415926980e-01 -1.6190248016125621e+00 + 13 4.0448964161972807e+00 -9.0213155125594269e-01 -1.6385398398261892e+00 + 14 2.6035151245155976e+00 -4.0874995488530264e-01 -2.6555999073601715e+00 + 15 2.9761196776308503e+00 5.6287237451805949e-01 -1.2442626194416131e+00 + 16 2.6517373020764849e+00 -2.3957035509095892e+00 3.3389262134333686e-02 + 17 2.2311114923825035e+00 -2.1018393229879604e+00 1.1496088522769004e+00 18 2.1384791188033843e+00 3.0177261773770208e+00 -3.5160827596876225e+00 19 1.5349125211132961e+00 2.6315969880333707e+00 -4.2472859440220647e+00 20 2.7641167828863153e+00 3.6833419064000221e+00 -3.9380850623312638e+00 @@ -47,23 +48,23 @@ run_pos: ! |2 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 run_vel: ! |2 - 1 4.7093296226164550e-04 2.6351124312060223e-04 -4.4905063547613568e-04 - 2 4.9594635271876775e-04 9.4561409237174846e-05 -5.4581325723053790e-04 - 3 3.3306088119081919e-04 2.3224949911015709e-04 -2.3659435306899653e-04 - 4 3.3692332940285361e-04 2.1926824120528752e-04 -2.4716611858555457e-04 - 5 3.3642541894622217e-04 4.1797578053944250e-04 -1.8011323945926332e-04 - 6 2.0926870695908031e-04 2.6449376032433555e-05 -1.0508922741401509e-04 - 7 1.4629046128363305e-04 -1.6873362379725323e-04 -6.8353900724087087e-05 - 8 1.5844098346817862e-04 3.7728756087615553e-05 -1.9162577392847385e-05 - 9 2.1299357198252531e-04 1.6917133003967874e-04 -6.3528006071188683e-05 - 10 5.4261569071251603e-05 -9.4655546204709643e-05 1.0511372702289179e-04 - 11 -3.2194218121513917e-05 -2.2025090185605342e-04 2.0300208519291412e-04 - 12 1.2640585449265036e-04 -2.9851081600945991e-04 -7.9476186245599681e-05 - 13 8.4523551795123310e-05 -4.0583140303606936e-04 -4.7550925831962545e-05 - 14 9.9954071734181717e-05 -4.2610809338914382e-04 -7.9255453072696249e-05 - 15 2.4417483202631243e-04 -2.3521005781666407e-04 -2.4875292755154228e-04 - 16 -9.0959360838797421e-06 3.7773746063106756e-06 2.4035204669042973e-04 - 17 5.7507084250803101e-05 2.2629200960629499e-04 2.0686926033796699e-04 + 1 4.7093296226165618e-04 2.6351124312058857e-04 -4.4905063547614403e-04 + 2 4.9594635271877252e-04 9.4561409237151983e-05 -5.4581325723054321e-04 + 3 3.3306088119082413e-04 2.3224949911015692e-04 -2.3659435306899455e-04 + 4 3.3692332940285378e-04 2.1926824120529077e-04 -2.4716611858554389e-04 + 5 3.3642541894623611e-04 4.1797578053944765e-04 -1.8011323945926958e-04 + 6 2.0926870695907706e-04 2.6449376032434591e-05 -1.0508922741400673e-04 + 7 1.4629046128361865e-04 -1.6873362379725188e-04 -6.8353900724066446e-05 + 8 1.5844098346817927e-04 3.7728756087617390e-05 -1.9162577392845779e-05 + 9 2.1299357198253474e-04 1.6917133003967807e-04 -6.3528006071197993e-05 + 10 5.4261569071247645e-05 -9.4655546204705848e-05 1.0511372702289633e-04 + 11 -3.2194218121526927e-05 -2.2025090185604412e-04 2.0300208519293052e-04 + 12 1.2640585449264128e-04 -2.9851081600947238e-04 -7.9476186245595616e-05 + 13 8.4523551795112752e-05 -4.0583140303608579e-04 -4.7550925831960783e-05 + 14 9.9954071734163598e-05 -4.2610809338915548e-04 -7.9255453072680826e-05 + 15 2.4417483202630842e-04 -2.3521005781668527e-04 -2.4875292755154548e-04 + 16 -9.0959360838764895e-06 3.7773746063197473e-06 2.4035204669042547e-04 + 17 5.7507084250817169e-05 2.2629200960630572e-04 2.0686926033795233e-04 18 -6.0936815808025862e-04 -9.3774557532468582e-04 -3.3558072507805731e-04 19 -6.9919768291957119e-04 -3.6060777270430031e-03 4.2833405289822791e-03 20 4.7777805013736515e-03 5.1003745845520452e-03 1.8002873923729241e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nve_small.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nve_small.yaml index 664921b147..1a017f0a6c 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nve_small.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nve_small.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:14 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,8 +15,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -4.9200114774918006e+01 -2.6907707694141354e+01 -6.0080872444875970e+00 -2.5620425756344780e+01 -1.3450222538011893e+01 -1.4947348732785031e+00 -global_scalar: 0.5007318719663203 + -4.9200114760030708e+01 -2.6907707699312748e+01 -6.0080872440179061e+00 -2.5620425767600064e+01 -1.3450222535184853e+01 -1.4947348700253382e+00 +global_scalar: 0.5007318719399354 run_pos: ! |2 1 -2.7993683669226832e-01 2.4726588069312840e+00 -1.7200860244148433e-01 2 3.0197083955402204e-01 2.9515239068888608e+00 -8.5689735572907566e-01 @@ -34,15 +35,15 @@ run_pos: ! |2 15 2.9756315249791303e+00 5.6334269722969288e-01 -1.2437650754599008e+00 16 2.6517554244980306e+00 -2.3957110424978438e+00 3.2908335999178327e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 - 18 2.1392027588270928e+00 3.0171068018423082e+00 -3.5144628518853867e+00 - 19 1.5366124996934336e+00 2.6286809834236959e+00 -4.2452547844313493e+00 - 20 2.7628161763597592e+00 3.6842251687468450e+00 -3.9370881219419189e+00 - 21 4.9036621348471368e+00 -4.0757648444604762e+00 -3.6192617654906609e+00 - 22 4.3655322292129357e+00 -4.2084949964269480e+00 -4.4622011117992786e+00 - 23 5.7380414790507261e+00 -3.5841969189265162e+00 -3.8827839828320116e+00 - 24 2.0701314764933532e+00 3.1499370533556008e+00 3.1565324853054118e+00 - 25 1.3030170721038390e+00 3.2711173927738786e+00 2.5081940917867680e+00 - 26 2.5776230784374867e+00 4.0127347067334345e+00 3.2182355136150917e+00 + 18 2.1392027588241729e+00 3.0171068018404634e+00 -3.5144628518858858e+00 + 19 1.5366124996944652e+00 2.6286809834366300e+00 -4.2452547844429631e+00 + 20 2.7628161763703827e+00 3.6842251687412753e+00 -3.9370881219283147e+00 + 21 4.9036621349084646e+00 -4.0757648444931904e+00 -3.6192617654848509e+00 + 22 4.3655322292057255e+00 -4.2084949967079632e+00 -4.4622011117106153e+00 + 23 5.7380414788131207e+00 -3.5841969185149058e+00 -3.8827839829438688e+00 + 24 2.0701314764430685e+00 3.1499370533656190e+00 3.1565324853444698e+00 + 25 1.3030170721374645e+00 3.2711173928413317e+00 2.5081940917372791e+00 + 26 2.5776230786045939e+00 4.0127347066259897e+00 3.2182355135086644e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.6149625094898067e-04 -3.1032459262177040e-04 8.1043030117471950e-04 - 19 8.5103884664914254e-04 -1.4572280597118469e-03 1.0163621287571445e-03 - 20 -6.5204659274939057e-04 4.3989037444674739e-04 4.9909839028631532e-04 - 21 -1.3888125888095134e-03 -3.1978049191290817e-04 1.1455681505629727e-03 - 22 -1.6084223473476296e-03 -1.5355394235202363e-03 1.4772010819351844e-03 - 23 2.6392672583440717e-04 -3.9375414417551127e-03 -3.6991583302200246e-04 - 24 8.6062827046548790e-04 -9.4179873487668705e-04 5.5396395555797203e-04 - 25 1.5933645478462865e-03 -2.2139156628290975e-03 -5.5078029723780941e-04 - 26 -1.5679561736454237e-03 3.5146224433513641e-04 2.4446924193838983e-03 + 18 3.6149625095571725e-04 -3.1032459263052550e-04 8.1043030117473349e-04 + 19 8.5103884662188244e-04 -1.4572280597071525e-03 1.0163621287543638e-03 + 20 -6.5204659274901945e-04 4.3989037447700791e-04 4.9909839028904252e-04 + 21 -1.3888125889514069e-03 -3.1978049248194420e-04 1.1455681505565557e-03 + 22 -1.6084223477996385e-03 -1.5355394224557757e-03 1.4772010822781041e-03 + 23 2.6392672685288674e-04 -3.9375414405480738e-03 -3.6991583333937880e-04 + 24 8.6062827042478370e-04 -9.4179873506334828e-04 5.5396395546095014e-04 + 25 1.5933645477524167e-03 -2.2139156628045932e-03 -5.5078029709943691e-04 + 26 -1.5679561733890424e-03 3.5146224505578228e-04 2.4446924196328459e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nvt.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nvt.yaml index a49508ca15..f15e2a3c7f 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nvt.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nvt.yaml @@ -1,7 +1,8 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 -epsilon: 5e-13 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:14 2024 +epsilon: 1e-12 skip_tests: prerequisites: ! | atom full @@ -13,26 +14,26 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.3123962047757550e+03 -1.3675423591710460e+03 -3.5468492999583855e+03 7.8271738572396373e+02 2.6480486115495069e+02 -7.6950536863736306e+02 -global_scalar: 68.08659647423171 + -1.3123962047758187e+03 -1.3675423591720764e+03 -3.5468492999580299e+03 7.8271738572394452e+02 2.6480486115379637e+02 -7.6950536863892899e+02 +global_scalar: 68.08659647424867 run_pos: ! |2 - 1 -2.7802951913990959e-01 2.4737132264311215e+00 -1.7381271738602289e-01 - 2 3.0397800832473609e-01 2.9519031941431444e+00 -8.5908822750267100e-01 - 3 -6.9299720296404743e-01 1.2449766685866726e+00 -6.2329294828335358e-01 - 4 -1.5757894675975461e+00 1.4924105480974301e+00 -1.2497098747240374e+00 - 5 -8.9364750934418624e-01 9.3735293261000852e-01 4.0154813851989335e-01 - 6 2.9498813449175199e-01 2.2729986882976547e-01 -1.2847387164260673e+00 - 7 3.4080910885027837e-01 -9.8008218359699473e-03 -2.4635938021179546e+00 - 8 1.1647778042705452e+00 -4.8360070140706557e-01 -6.7668409924218165e-01 - 9 1.3786230528159027e+00 -2.5298559880150862e-01 2.6851325883861188e-01 - 10 2.0187712935465942e+00 -1.4287732348422091e+00 -9.6692440387148870e-01 - 11 1.7928755785831587e+00 -1.9879833661313322e+00 -1.8832605388690278e+00 - 12 3.0035558347419657e+00 -4.9042429038271507e-01 -1.6191927838346238e+00 - 13 4.0450911337530959e+00 -9.0293975523160919e-01 -1.6386440514135796e+00 - 14 2.6037405819194577e+00 -4.0959881564101863e-01 -2.6557674031621108e+00 - 15 2.9766330093335447e+00 5.6240461100771322e-01 -1.2447686007433758e+00 - 16 2.6517453810147344e+00 -2.3956939898026426e+00 3.3859750042781744e-02 - 17 2.2312525656149020e+00 -2.1013855689264771e+00 1.1500124166835219e+00 + 1 -2.7802951913978302e-01 2.4737132264315886e+00 -1.7381271738770820e-01 + 2 3.0397800832462774e-01 2.9519031941430738e+00 -8.5908822750493219e-01 + 3 -6.9299720296403144e-01 1.2449766685867643e+00 -6.2329294828390935e-01 + 4 -1.5757894675977402e+00 1.4924105480969891e+00 -1.2497098747245081e+00 + 5 -8.9364750934382919e-01 9.3735293261092456e-01 4.0154813851965188e-01 + 6 2.9498813449158368e-01 2.2729986882934847e-01 -1.2847387164261186e+00 + 7 3.4080910884973536e-01 -9.8008218373410172e-03 -2.4635938021178290e+00 + 8 1.1647778042705941e+00 -4.8360070140696521e-01 -6.7668409924193851e-01 + 9 1.3786230528162504e+00 -2.5298559880063631e-01 2.6851325883859889e-01 + 10 2.0187712935465760e+00 -1.4287732348423197e+00 -9.6692440387075651e-01 + 11 1.7928755785828601e+00 -1.9879833661321924e+00 -1.8832605388677695e+00 + 12 3.0035558347417104e+00 -4.9042429038332558e-01 -1.6191927838349707e+00 + 13 4.0450911337528455e+00 -9.0293975523220671e-01 -1.6386440514139291e+00 + 14 2.6037405819188639e+00 -4.0959881564248080e-01 -2.6557674031623937e+00 + 15 2.9766330093333795e+00 5.6240461100740513e-01 -1.2447686007445669e+00 + 16 2.6517453810150675e+00 -2.3956939898019254e+00 3.3859750044092363e-02 + 17 2.2312525656155877e+00 -2.1013855689248668e+00 1.1500124166847305e+00 18 2.1384791188033843e+00 3.0177261773770208e+00 -3.5160827596876225e+00 19 1.5349125211132961e+00 2.6315969880333707e+00 -4.2472859440220647e+00 20 2.7641167828863153e+00 3.6833419064000221e+00 -3.9380850623312638e+00 @@ -46,23 +47,23 @@ run_pos: ! |2 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 run_vel: ! |2 - 1 1.8443993556501188e-03 1.0121779580014884e-03 -1.7361326034900006e-03 - 2 1.9401022924558343e-03 3.6428754787592733e-04 -2.1069540627804634e-03 - 3 1.3158623602983108e-03 8.9265747656461540e-04 -9.2144725682164657e-04 - 4 1.3306255280096089e-03 8.4281508655990054e-04 -9.6194026572564146e-04 - 5 1.3289682243410692e-03 1.6048237018834067e-03 -7.0511232123071470e-04 - 6 8.4113718611833661e-04 1.0389683283144290e-04 -4.1697370456873913e-04 - 7 5.9950574545626287e-04 -6.4437674895215604e-04 -2.7586696717582678e-04 - 8 6.4634547270651834e-04 1.4734228826522431e-04 -8.7540766366730972e-05 - 9 8.5561404484505246e-04 6.5123532540338036e-04 -2.5782947158524498e-04 - 10 2.4688038968480818e-04 -3.5995975344065565e-04 3.8912416843275122e-04 - 11 -8.4672359473208624e-05 -8.4134349031640394e-04 7.6463157764214873e-04 - 12 5.2321633256319569e-04 -1.1418047427480882e-03 -3.1842516233562688e-04 - 13 3.6258187754908603e-04 -1.5531581259494627e-03 -1.9590476904013767e-04 - 14 4.2166181631324117e-04 -1.6310415916630540e-03 -3.1740232809360453e-04 - 15 9.7471807923383321e-04 -8.9939841790992827e-04 -9.6757308853409824e-04 - 16 4.1534888649229478e-06 1.7705740202856454e-05 9.0753010117813394e-04 - 17 2.5969943716026096e-04 8.7075266710270492e-04 7.7887058799645239e-04 + 1 1.8443993556507550e-03 1.0121779580008364e-03 -1.7361326034906117e-03 + 2 1.9401022924561704e-03 3.6428754787474148e-04 -2.1069540627809144e-03 + 3 1.3158623602986111e-03 8.9265747656469769e-04 -9.2144725682167822e-04 + 4 1.3306255280096579e-03 8.4281508656016844e-04 -9.6194026572521364e-04 + 5 1.3289682243418409e-03 1.6048237018838117e-03 -7.0511232123120064e-04 + 6 8.4113718611817723e-04 1.0389683283156166e-04 -4.1697370456838980e-04 + 7 5.9950574545550414e-04 -6.4437674895204296e-04 -2.7586696717479603e-04 + 8 6.4634547270655271e-04 1.4734228826538870e-04 -8.7540766366732192e-05 + 9 8.5561404484556399e-04 6.5123532540342654e-04 -2.5782947158584915e-04 + 10 2.4688038968457540e-04 -3.5995975344040265e-04 3.8912416843293944e-04 + 11 -8.4672359473939376e-05 -8.4134349031586390e-04 7.6463157764299549e-04 + 12 5.2321633256271539e-04 -1.1418047427487572e-03 -3.1842516233546950e-04 + 13 3.6258187754852045e-04 -1.5531581259503574e-03 -1.9590476904008422e-04 + 14 4.2166181631227780e-04 -1.6310415916636891e-03 -3.1740232809282303e-04 + 15 9.7471807923364706e-04 -8.9939841791107037e-04 -9.6757308853435780e-04 + 16 4.1534888650543531e-06 1.7705740203412426e-05 9.0753010117785768e-04 + 17 2.5969943716097897e-04 8.7075266710338634e-04 7.7887058799558893e-04 18 -6.0936815808025862e-04 -9.3774557532468582e-04 -3.3558072507805731e-04 19 -6.9919768291957119e-04 -3.6060777270430031e-03 4.2833405289822791e-03 20 4.7777805013736515e-03 5.1003745845520452e-03 1.8002873923729241e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_nvt_small.yaml b/unittest/force-styles/tests/fix-timestep-rigid_nvt_small.yaml index 6068993094..dab3d52a04 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_nvt_small.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_nvt_small.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:15 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -13,8 +14,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.4827261116680472e+02 -1.8411194349753309e+01 -1.0752762859308649e+02 -2.1814511477016276e+02 1.7027764307147623e+02 2.1058942244057214e+01 -global_scalar: 0.9532609554739606 + -1.4827261099624999e+02 -1.8411194282828326e+01 -1.0752762861573947e+02 -2.1814511471949461e+02 1.7027764305079162e+02 2.1058942246396320e+01 +global_scalar: 0.9532609552151339 run_pos: ! |2 1 -2.7993683669226832e-01 2.4726588069312840e+00 -1.7200860244148433e-01 2 3.0197083955402204e-01 2.9515239068888608e+00 -8.5689735572907566e-01 @@ -33,15 +34,15 @@ run_pos: ! |2 15 2.9756315249791303e+00 5.6334269722969288e-01 -1.2437650754599008e+00 16 2.6517554244980306e+00 -2.3957110424978438e+00 3.2908335999178327e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 - 18 2.1395635672981443e+00 3.0168023048492310e+00 -3.5136606977867388e+00 - 19 1.5374727853253387e+00 2.6272080572819609e+00 -4.2442423140467360e+00 - 20 2.7621434607990372e+00 3.6846842324743214e+00 -3.9366036441030396e+00 - 21 4.9022821625125470e+00 -4.0760572704380627e+00 -3.6181235130909242e+00 - 22 4.3639257458824501e+00 -4.2100277325126187e+00 -4.4607219430080747e+00 - 23 5.7383384133351401e+00 -3.5881799317362106e+00 -3.8831848688588710e+00 - 24 2.0709922902331592e+00 3.1490053461169678e+00 3.1570777020268803e+00 - 25 1.3046262534530633e+00 3.2688902575528282e+00 2.5076144141701078e+00 - 26 2.5760050685080813e+00 4.0131166912605272e+00 3.2207051913215210e+00 + 18 2.1395635672857165e+00 3.0168023048413781e+00 -3.5136606977888540e+00 + 19 1.5374727853296883e+00 2.6272080573369379e+00 -4.2442423140961818e+00 + 20 2.7621434608442974e+00 3.6846842324506923e+00 -3.9366036440451500e+00 + 21 4.9022821627727593e+00 -4.0760572705753884e+00 -3.6181235130648650e+00 + 22 4.3639257458473608e+00 -4.2100277337137149e+00 -4.4607219426242057e+00 + 23 5.7383384123314833e+00 -3.5881799299869201e+00 -3.8831848693467652e+00 + 24 2.0709922900187268e+00 3.1490053461587983e+00 3.1570777021928031e+00 + 25 1.3046262535950772e+00 3.2688902578410239e+00 2.5076144139609013e+00 + 26 2.5760050692220648e+00 4.0131166908053464e+00 3.2207051908683750e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 @@ -63,15 +64,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 7.8522439440007537e-04 -6.6826115062653757e-04 1.7528441282153480e-03 - 19 1.8628941719211862e-03 -3.1840047051916367e-03 2.2062694140207390e-03 - 20 -1.4430972531298200e-03 9.7564145841628493e-04 1.0686492192569898e-03 - 21 -3.0047717246574385e-03 -6.6139343888744974e-04 2.4784169377340712e-03 - 22 -3.4980341571643784e-03 -3.3380963325931002e-03 3.2191613979274040e-03 - 23 5.9333930569297746e-04 -8.6231086219834968e-03 -8.2692040355627789e-04 - 24 1.8727912311097641e-03 -2.0349136820274911e-03 1.1951471753018509e-03 - 25 3.4887365958745920e-03 -4.8232966889391266e-03 -1.2263764490291313e-03 - 26 -3.4770258010749858e-03 7.8662050223200905e-04 5.3381090661352298e-03 + 18 7.8522439447394326e-04 -6.6826115070004464e-04 1.7528441282176825e-03 + 19 1.8628941717384645e-03 -3.1840047052822556e-03 2.2062694140226819e-03 + 20 -1.4430972532419690e-03 9.7564145880034414e-04 1.0686492192457362e-03 + 21 -3.0047717260078453e-03 -6.6139344410253201e-04 2.4784169376928207e-03 + 22 -3.4980341614361554e-03 -3.3380963226435131e-03 3.2191614012082861e-03 + 23 5.9333931535562748e-04 -8.6231086111150188e-03 -8.2692040667271425e-04 + 24 1.8727912307694413e-03 -2.0349136837727452e-03 1.1951471743788897e-03 + 25 3.4887365951187166e-03 -4.8232966888750181e-03 -1.2263764478999317e-03 + 26 -3.4770257989604671e-03 7.8662050913487318e-04 5.3381090686904524e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_single.yaml b/unittest/force-styles/tests/fix-timestep-rigid_single.yaml index 02acb437d9..5af45ebc1b 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_single.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_single.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:15 2024 epsilon: 7.5e-13 skip_tests: prerequisites: ! | @@ -14,26 +15,26 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -1.3754817466835989e+03 -1.4228425246166137e+03 -3.6087196201914348e+03 8.7407043149698916e+02 2.1665316519769885e+02 -6.0480791462033073e+02 -global_scalar: 4.531423038570297 + -1.3754817466835991e+03 -1.4228425246166084e+03 -3.6087196201914162e+03 8.7407043149699155e+02 2.1665316519769920e+02 -6.0480791462032153e+02 +global_scalar: 4.531423038570312 run_pos: ! |2 - 1 -2.7899546859693136e-01 2.4731857340428784e+00 -1.7290667720876285e-01 - 2 3.0296221616793728e-01 2.9517129917211546e+00 -8.5798904365355155e-01 - 3 -6.9368802362166204e-01 1.2445115422149751e+00 -6.2281111185285498e-01 - 4 -1.5764879646739487e+00 1.4919714416722010e+00 -1.2492069413381564e+00 - 5 -8.9434512967954416e-01 9.3651699743538730e-01 4.0191726569957442e-01 - 6 2.9454439635065910e-01 2.2724545796943096e-01 -1.2845195052894232e+00 - 7 3.4049112905311751e-01 -9.4655677385591108e-03 -2.4634480019885228e+00 - 8 1.1644354555589662e+00 -4.8367776651302724e-01 -6.7663643931660777e-01 - 9 1.3781717822376680e+00 -2.5332509534947545e-01 2.6864954447021416e-01 - 10 2.0186368605645764e+00 -1.4285861423742918e+00 -9.6712491246325605e-01 - 11 1.7929137227200918e+00 -1.9875455388074483e+00 -1.8836565351900385e+00 - 12 3.0032775230343125e+00 -4.8983022415935312e-01 -1.6190248016126132e+00 - 13 4.0448964161972283e+00 -9.0213155125606947e-01 -1.6385398398262669e+00 - 14 2.6035151245155355e+00 -4.0874995488538129e-01 -2.6555999073602123e+00 - 15 2.9761196776308694e+00 5.6287237451798222e-01 -1.2442626194416753e+00 - 16 2.6517373020764219e+00 -2.3957035509096407e+00 3.3389262134315700e-02 - 17 2.2311114923824857e+00 -2.1018393229879817e+00 1.1496088522768926e+00 + 1 -2.7899546859693225e-01 2.4731857340428771e+00 -1.7290667720876018e-01 + 2 3.0296221616793617e-01 2.9517129917211538e+00 -8.5798904365354822e-01 + 3 -6.9368802362166271e-01 1.2445115422149740e+00 -6.2281111185285387e-01 + 4 -1.5764879646739496e+00 1.4919714416722003e+00 -1.2492069413381550e+00 + 5 -8.9434512967954460e-01 9.3651699743538508e-01 4.0191726569957453e-01 + 6 2.9454439635065910e-01 2.2724545796943096e-01 -1.2845195052894227e+00 + 7 3.4049112905311785e-01 -9.4655677385578896e-03 -2.4634480019885232e+00 + 8 1.1644354555589664e+00 -4.8367776651302741e-01 -6.7663643931660822e-01 + 9 1.3781717822376678e+00 -2.5332509534947639e-01 2.6864954447021405e-01 + 10 2.0186368605645768e+00 -1.4285861423742912e+00 -9.6712491246325705e-01 + 11 1.7929137227200933e+00 -1.9875455388074468e+00 -1.8836565351900401e+00 + 12 3.0032775230343129e+00 -4.8983022415935129e-01 -1.6190248016126136e+00 + 13 4.0448964161972292e+00 -9.0213155125606781e-01 -1.6385398398262672e+00 + 14 2.6035151245155359e+00 -4.0874995488537874e-01 -2.6555999073602123e+00 + 15 2.9761196776308694e+00 5.6287237451798344e-01 -1.2442626194416739e+00 + 16 2.6517373020764223e+00 -2.3957035509096416e+00 3.3389262134313369e-02 + 17 2.2311114923824853e+00 -2.1018393229879830e+00 1.1496088522768906e+00 18 2.1384791188033843e+00 3.0177261773770208e+00 -3.5160827596876225e+00 19 1.5349125211132961e+00 2.6315969880333707e+00 -4.2472859440220647e+00 20 2.7641167828863153e+00 3.6833419064000221e+00 -3.9380850623312638e+00 @@ -47,23 +48,23 @@ run_pos: ! |2 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 run_vel: ! |2 - 1 4.7093296226165759e-04 2.6351124312057290e-04 -4.4905063547614669e-04 - 2 4.9594635271877263e-04 9.4561409237138512e-05 -5.4581325723053421e-04 - 3 3.3306088119083101e-04 2.3224949911015481e-04 -2.3659435306900835e-04 - 4 3.3692332940286717e-04 2.1926824120529722e-04 -2.4716611858556465e-04 - 5 3.3642541894624077e-04 4.1797578053943724e-04 -1.8011323945929064e-04 - 6 2.0926870695908283e-04 2.6449376032441855e-05 -1.0508922741401397e-04 - 7 1.4629046128362884e-04 -1.6873362379723068e-04 -6.8353900724071325e-05 - 8 1.5844098346817943e-04 3.7728756087619084e-05 -1.9162577392849316e-05 - 9 2.1299357198253002e-04 1.6917133003966749e-04 -6.3528006071200284e-05 - 10 5.4261569071245856e-05 -9.4655546204698666e-05 1.0511372702289738e-04 - 11 -3.2194218121523431e-05 -2.2025090185602293e-04 2.0300208519292805e-04 - 12 1.2640585449263546e-04 -2.9851081600946745e-04 -7.9476186245575585e-05 - 13 8.4523551795102263e-05 -4.0583140303608199e-04 -4.7550925831931374e-05 - 14 9.9954071734163435e-05 -4.2610809338913835e-04 -7.9255453072661880e-05 - 15 2.4417483202629980e-04 -2.3521005781669047e-04 -2.4875292755152005e-04 - 16 -9.0959360838836724e-06 3.7773746063194780e-06 2.4035204669042463e-04 - 17 5.7507084250807628e-05 2.2629200960629336e-04 2.0686926033794547e-04 + 1 4.7093296226165726e-04 2.6351124312057366e-04 -4.4905063547614750e-04 + 2 4.9594635271877263e-04 9.4561409237139244e-05 -5.4581325723053519e-04 + 3 3.3306088119083079e-04 2.3224949911015511e-04 -2.3659435306900900e-04 + 4 3.3692332940286722e-04 2.1926824120529738e-04 -2.4716611858556574e-04 + 5 3.3642541894624012e-04 4.1797578053943767e-04 -1.8011323945929113e-04 + 6 2.0926870695908297e-04 2.6449376032441903e-05 -1.0508922741401427e-04 + 7 1.4629046128362941e-04 -1.6873362379723111e-04 -6.8353900724071745e-05 + 8 1.5844098346817935e-04 3.7728756087619165e-05 -1.9162577392849147e-05 + 9 2.1299357198252962e-04 1.6917133003966793e-04 -6.3528006071199972e-05 + 10 5.4261569071245965e-05 -9.4655546204698774e-05 1.0511372702289789e-04 + 11 -3.2194218121522970e-05 -2.2025090185602350e-04 2.0300208519292848e-04 + 12 1.2640585449263589e-04 -2.9851081600946756e-04 -7.9476186245575178e-05 + 13 8.4523551795102697e-05 -4.0583140303608210e-04 -4.7550925831930561e-05 + 14 9.9954071734164248e-05 -4.2610809338913878e-04 -7.9255453072661758e-05 + 15 2.4417483202630007e-04 -2.3521005781669015e-04 -2.4875292755151984e-04 + 16 -9.0959360838839976e-06 3.7773746063194848e-06 2.4035204669042588e-04 + 17 5.7507084250806896e-05 2.2629200960629374e-04 2.0686926033794661e-04 18 -6.0936815808025862e-04 -9.3774557532468582e-04 -3.3558072507805731e-04 19 -6.9919768291957119e-04 -3.6060777270430031e-03 4.2833405289822791e-03 20 4.7777805013736515e-03 5.1003745845520452e-03 1.8002873923729241e-03 diff --git a/unittest/force-styles/tests/fix-timestep-rigid_small.yaml b/unittest/force-styles/tests/fix-timestep-rigid_small.yaml index 92754f85f5..ed92a4e5a8 100644 --- a/unittest/force-styles/tests/fix-timestep-rigid_small.yaml +++ b/unittest/force-styles/tests/fix-timestep-rigid_small.yaml @@ -1,6 +1,7 @@ --- -lammps_version: 17 Feb 2022 -date_generated: Fri Mar 18 22:18:00 2022 +lammps_version: 7 Feb 2024 +tags: +date_generated: Thu Apr 4 21:27:15 2024 epsilon: 5e-13 skip_tests: prerequisites: ! | @@ -14,8 +15,8 @@ post_commands: ! | input_file: in.fourmol natoms: 29 run_stress: ! |- - -4.9200116134789873e+01 -2.6907707565987707e+01 -6.0080860422278581e+00 -2.5620423972101300e+01 -1.3450224059983967e+01 -1.4947288487003760e+00 -global_scalar: 18.3405601674144 + -4.9200116134788615e+01 -2.6907707565987401e+01 -6.0080860422276308e+00 -2.5620423972100241e+01 -1.3450224059984270e+01 -1.4947288487006070e+00 +global_scalar: 18.340560167414402 run_pos: ! |2 1 -2.7993683669226832e-01 2.4726588069312840e+00 -1.7200860244148433e-01 2 3.0197083955402204e-01 2.9515239068888608e+00 -8.5689735572907566e-01 @@ -34,15 +35,15 @@ run_pos: ! |2 15 2.9756315249791303e+00 5.6334269722969288e-01 -1.2437650754599008e+00 16 2.6517554244980306e+00 -2.3957110424978438e+00 3.2908335999178327e-02 17 2.2309964792710639e+00 -2.1022918943319384e+00 1.1491948328949437e+00 - 18 2.1392027588271301e+00 3.0171068018412779e+00 -3.5144628518856349e+00 - 19 1.5366124997074571e+00 2.6286809834111748e+00 -4.2452547844370221e+00 - 20 2.7628161763455852e+00 3.6842251687634775e+00 -3.9370881219352554e+00 - 21 4.9036621347791245e+00 -4.0757648442838548e+00 -3.6192617654515904e+00 - 22 4.3655322291888483e+00 -4.2084949965552561e+00 -4.4622011117402334e+00 - 23 5.7380414793463101e+00 -3.5841969195032672e+00 -3.8827839830470219e+00 + 18 2.1392027588271301e+00 3.0171068018412783e+00 -3.5144628518856353e+00 + 19 1.5366124997074575e+00 2.6286809834111740e+00 -4.2452547844370221e+00 + 20 2.7628161763455852e+00 3.6842251687634775e+00 -3.9370881219352558e+00 + 21 4.9036621347791236e+00 -4.0757648442838548e+00 -3.6192617654515908e+00 + 22 4.3655322291888474e+00 -4.2084949965552561e+00 -4.4622011117402343e+00 + 23 5.7380414793463110e+00 -3.5841969195032672e+00 -3.8827839830470219e+00 24 2.0701314765323930e+00 3.1499370533342330e+00 3.1565324852522938e+00 - 25 1.3030170721374779e+00 3.2711173927682249e+00 2.5081940917429768e+00 - 26 2.5776230782480045e+00 4.0127347068243875e+00 3.2182355138709275e+00 + 25 1.3030170721374787e+00 3.2711173927682244e+00 2.5081940917429759e+00 + 26 2.5776230782480041e+00 4.0127347068243884e+00 3.2182355138709284e+00 27 -1.9613581876744359e+00 -4.3556300596085160e+00 2.1101467673534788e+00 28 -2.7406520384725965e+00 -4.0207251278130975e+00 1.5828689861678511e+00 29 -1.3108232656499081e+00 -3.5992986322410760e+00 2.2680459788743503e+00 @@ -64,15 +65,15 @@ run_vel: ! |2 15 -4.3301707382721859e-03 -3.1802661664634938e-03 3.2037919043360571e-03 16 -9.6715751018414326e-05 -5.0016572678960377e-04 1.4945658875149626e-03 17 6.5692180538157174e-04 3.6635779995305095e-04 8.3495414466050911e-04 - 18 3.6149625095704914e-04 -3.1032459262908286e-04 8.1043030117346052e-04 - 19 8.5103884665345452e-04 -1.4572280596788108e-03 1.0163621287634116e-03 - 20 -6.5204659278590683e-04 4.3989037444289853e-04 4.9909839028507901e-04 - 21 -1.3888125881903923e-03 -3.1978049143082385e-04 1.1455681499836646e-03 - 22 -1.6084223477729510e-03 -1.5355394240821117e-03 1.4772010826232375e-03 - 23 2.6392672378805124e-04 -3.9375414431174821e-03 -3.6991583139728095e-04 - 24 8.6062827067890247e-04 -9.4179873474469237e-04 5.5396395550012453e-04 - 25 1.5933645477487538e-03 -2.2139156625681695e-03 -5.5078029695647401e-04 - 26 -1.5679561743998840e-03 3.5146224354726100e-04 2.4446924193334478e-03 + 18 3.6149625095704849e-04 -3.1032459262908286e-04 8.1043030117346052e-04 + 19 8.5103884665345473e-04 -1.4572280596788095e-03 1.0163621287634121e-03 + 20 -6.5204659278590661e-04 4.3989037444289755e-04 4.9909839028507901e-04 + 21 -1.3888125881903906e-03 -3.1978049143082342e-04 1.1455681499836646e-03 + 22 -1.6084223477729526e-03 -1.5355394240821163e-03 1.4772010826232394e-03 + 23 2.6392672378804821e-04 -3.9375414431174795e-03 -3.6991583139728377e-04 + 24 8.6062827067890269e-04 -9.4179873474469291e-04 5.5396395550012388e-04 + 25 1.5933645477487551e-03 -2.2139156625681673e-03 -5.5078029695647250e-04 + 26 -1.5679561743998888e-03 3.5146224354726068e-04 2.4446924193334487e-03 27 -2.6510179146429716e-04 3.6306203629019116e-04 -5.6235585400647747e-04 28 -2.3068708109787484e-04 -8.5663070212203200e-04 2.1302563179109169e-03 29 -2.5054744388303732e-03 -1.6773997805290820e-04 2.8436699761004796e-03 diff --git a/unittest/force-styles/tests/mol-pair-tip4p_long.yaml b/unittest/force-styles/tests/mol-pair-tip4p_long.yaml index 072642c471..9a67706c01 100644 --- a/unittest/force-styles/tests/mol-pair-tip4p_long.yaml +++ b/unittest/force-styles/tests/mol-pair-tip4p_long.yaml @@ -2,7 +2,7 @@ lammps_version: 17 Feb 2022 tags: unstable date_generated: Fri Mar 18 22:17:36 2022 -epsilon: 5e-13 +epsilon: 1e-10 skip_tests: prerequisites: ! | atom full