Compare commits
63 Commits
patch_9Jan
...
patch_17Ja
| Author | SHA1 | Date | |
|---|---|---|---|
| d2fc88a626 | |||
| c52a26382f | |||
| ad4d299975 | |||
| 83408b195f | |||
| cd7bdf9251 | |||
| 8c5b108900 | |||
| c19d2011bb | |||
| 973bef4d45 | |||
| 1b9e50c8cb | |||
| 252e07e083 | |||
| 74a661ae26 | |||
| d8bc590aaf | |||
| c9bea60710 | |||
| 5cd856c97f | |||
| 2f13365cf5 | |||
| 0a2b78acb8 | |||
| 3f46b6d782 | |||
| 5abd6e5122 | |||
| f3a82f454e | |||
| 473a3ebeef | |||
| b220850377 | |||
| fa00e0593f | |||
| 4a09399dc6 | |||
| 5821fe8dd5 | |||
| 98ceb6feb1 | |||
| 61cff85435 | |||
| aa0b327f7e | |||
| 04fe071968 | |||
| 78498715b4 | |||
| b2f67fea30 | |||
| c59bcf31d1 | |||
| 2540fc281c | |||
| e8e03dd440 | |||
| daf766d4f8 | |||
| 630783c8e8 | |||
| c94030d966 | |||
| 1229f6f60b | |||
| 0b081b0086 | |||
| 8e1cf6643c | |||
| 6950a99162 | |||
| 9f4e5e0661 | |||
| 34cb4027df | |||
| 1d0e600ab7 | |||
| 7162cafdf5 | |||
| ee9e7cfbd5 | |||
| 7839c335da | |||
| 622d926849 | |||
| 92d15d4a89 | |||
| 95706ac846 | |||
| d06688bb91 | |||
| d014e00e53 | |||
| 0db2a07993 | |||
| 33412c76ed | |||
| e5ac49d1de | |||
| 1a81da0f73 | |||
| ebd25cc078 | |||
| 9250a55923 | |||
| a9f0b7d523 | |||
| 20f8a8c219 | |||
| 09af780aa8 | |||
| 51e52b477a | |||
| 20a4e365b7 | |||
| ccd09e3967 |
Binary file not shown.
|
Before Width: | Height: | Size: 57 KiB After Width: | Height: | Size: 25 KiB |
BIN
doc/src/JPG/tutorial_reverse_pull_request7.png
Normal file
BIN
doc/src/JPG/tutorial_reverse_pull_request7.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
@ -1,7 +1,7 @@
|
||||
<!-- HTML_ONLY -->
|
||||
<HEAD>
|
||||
<TITLE>LAMMPS Users Manual</TITLE>
|
||||
<META NAME="docnumber" CONTENT="9 Jan 2017 version">
|
||||
<META NAME="docnumber" CONTENT="17 Jan 2017 version">
|
||||
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
|
||||
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
|
||||
</HEAD>
|
||||
@ -21,7 +21,7 @@
|
||||
<H1></H1>
|
||||
|
||||
LAMMPS Documentation :c,h3
|
||||
9 Jan 2017 version :c,h4
|
||||
17 Jan 2017 version :c,h4
|
||||
|
||||
Version info: :h4
|
||||
|
||||
|
||||
@ -55,12 +55,13 @@ LAMMPS errors are detected at setup time; others like a bond
|
||||
stretching too far may not occur until the middle of a run.
|
||||
|
||||
LAMMPS tries to flag errors and print informative error messages so
|
||||
you can fix the problem. Of course, LAMMPS cannot figure out your
|
||||
physics or numerical mistakes, like choosing too big a timestep,
|
||||
specifying erroneous force field coefficients, or putting 2 atoms on
|
||||
top of each other! If you run into errors that LAMMPS doesn't catch
|
||||
that you think it should flag, please send an email to the
|
||||
"developers"_http://lammps.sandia.gov/authors.html.
|
||||
you can fix the problem. For most errors it will also print the last
|
||||
input script command that it was processing. Of course, LAMMPS cannot
|
||||
figure out your physics or numerical mistakes, like choosing too big a
|
||||
timestep, specifying erroneous force field coefficients, or putting 2
|
||||
atoms on top of each other! If you run into errors that LAMMPS
|
||||
doesn't catch that you think it should flag, please send an email to
|
||||
the "developers"_http://lammps.sandia.gov/authors.html.
|
||||
|
||||
If you get an error message about an invalid command in your input
|
||||
script, you can determine what command is causing the problem by
|
||||
|
||||
@ -1153,7 +1153,7 @@ Package, Description, Author(s), Doc page, Example, Pic/movie, Library
|
||||
"USER-MISC"_#USER-MISC, single-file contributions, USER-MISC/README, USER-MISC/README, -, -, -
|
||||
"USER-MANIFOLD"_#USER-MANIFOLD, motion on 2d surface, Stefan Paquay (Eindhoven U of Technology), "fix manifoldforce"_fix_manifoldforce.html, USER/manifold, "manifold"_manifold, -
|
||||
"USER-MOLFILE"_#USER-MOLFILE, "VMD"_VMD molfile plug-ins, Axel Kohlmeyer (Temple U), "dump molfile"_dump_molfile.html, -, -, VMD-MOLFILE
|
||||
"USER-NC-DUMP"_#USER-NC-DUMP, dump output via NetCDF, Lars Pastewka (Karlsruhe Institute of Technology, KIT), "dump nc, dump nc/mpiio"_dump_nc.html, -, -, lib/netcdf
|
||||
"USER-NC-DUMP"_#USER-NC-DUMP, dump output via NetCDF, Lars Pastewka (Karlsruhe Institute of Technology, KIT), "dump nc / dump nc/mpiio"_dump_nc.html, -, -, lib/netcdf
|
||||
"USER-OMP"_#USER-OMP, OpenMP threaded styles, Axel Kohlmeyer (Temple U), "Section 5.3.4"_accelerate_omp.html, -, -, -
|
||||
"USER-PHONON"_#USER-PHONON, phonon dynamical matrix, Ling-Ti Kong (Shanghai Jiao Tong U), "fix phonon"_fix_phonon.html, USER/phonon, -, -
|
||||
"USER-QMMM"_#USER-QMMM, QM/MM coupling, Axel Kohlmeyer (Temple U), "fix qmmm"_fix_qmmm.html, USER/qmmm, -, lib/qmmm
|
||||
@ -1610,11 +1610,12 @@ and a "dump nc/mpiio"_dump_nc.html command to output LAMMPS snapshots
|
||||
in this format. See src/USER-NC-DUMP/README for more details.
|
||||
|
||||
NetCDF files can be directly visualized with the following tools:
|
||||
|
||||
Ovito (http://www.ovito.org/). Ovito supports the AMBER convention
|
||||
and all of the above extensions. :ulb,l
|
||||
and all of the above extensions. :ulb,l
|
||||
VMD (http://www.ks.uiuc.edu/Research/vmd/) :l
|
||||
AtomEye (http://www.libatoms.org/). The libAtoms version of AtomEye contains
|
||||
a NetCDF reader that is not present in the standard distribution of AtomEye :l,ule
|
||||
a NetCDF reader that is not present in the standard distribution of AtomEye :l,ule
|
||||
|
||||
The person who created these files is Lars Pastewka at
|
||||
Karlsruhe Institute of Technology (lars.pastewka at kit.edu).
|
||||
|
||||
@ -1727,7 +1727,7 @@ thermodynamic state and a total run time for the simulation. It then
|
||||
appends statistics about the CPU time and storage requirements for the
|
||||
simulation. An example set of statistics is shown here:
|
||||
|
||||
Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms
|
||||
Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms :pre
|
||||
|
||||
Performance: 18.436 ns/day 1.302 hours/ns 106.689 timesteps/s
|
||||
97.0% CPU use with 4 MPI tasks x no OpenMP threads :pre
|
||||
@ -1757,14 +1757,14 @@ Ave special neighs/atom = 2.34032
|
||||
Neighbor list builds = 26
|
||||
Dangerous builds = 0 :pre
|
||||
|
||||
The first section provides a global loop timing summary. The loop time
|
||||
The first section provides a global loop timing summary. The {loop time}
|
||||
is the total wall time for the section. The {Performance} line is
|
||||
provided for convenience to help predict the number of loop
|
||||
continuations required and for comparing performance with other
|
||||
similar MD codes. The CPU use line provides the CPU utilzation per
|
||||
continuations required and for comparing performance with other,
|
||||
similar MD codes. The {CPU use} line provides the CPU utilization per
|
||||
MPI task; it should be close to 100% times the number of OpenMP
|
||||
threads (or 1). Lower numbers correspond to delays due to file I/O or
|
||||
insufficient thread utilization.
|
||||
threads (or 1 if no OpenMP). Lower numbers correspond to delays due
|
||||
to file I/O or insufficient thread utilization.
|
||||
|
||||
The MPI task section gives the breakdown of the CPU run time (in
|
||||
seconds) into major categories:
|
||||
@ -1791,7 +1791,7 @@ is present that also prints the CPU utilization in percent. In
|
||||
addition, when using {timer full} and the "package omp"_package.html
|
||||
command are active, a similar timing summary of time spent in threaded
|
||||
regions to monitor thread utilization and load balance is provided. A
|
||||
new entry is the {Reduce} section, which lists the time spend in
|
||||
new entry is the {Reduce} section, which lists the time spent in
|
||||
reducing the per-thread data elements to the storage for non-threaded
|
||||
computation. These thread timings are taken from the first MPI rank
|
||||
only and thus, as the breakdown for MPI tasks can change from MPI
|
||||
|
||||
@ -110,14 +110,14 @@ mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj # ditto on 8 Phis :p
|
||||
[Required hardware/software:]
|
||||
|
||||
Kokkos support within LAMMPS must be built with a C++11 compatible
|
||||
compiler. If using gcc, version 4.8.1 or later is required.
|
||||
compiler. If using gcc, version 4.7.2 or later is required.
|
||||
|
||||
To build with Kokkos support for CPUs, your compiler must support the
|
||||
OpenMP interface. You should have one or more multi-core CPUs so that
|
||||
multiple threads can be launched by each MPI task running on a CPU.
|
||||
|
||||
To build with Kokkos support for NVIDIA GPUs, NVIDIA Cuda software
|
||||
version 6.5 or later must be installed on your system. See the
|
||||
version 7.5 or later must be installed on your system. See the
|
||||
discussion for the "GPU"_accelerate_gpu.html package for details of
|
||||
how to check and do this.
|
||||
|
||||
|
||||
@ -91,6 +91,7 @@ Commands :h1
|
||||
suffix
|
||||
tad
|
||||
temper
|
||||
temper_grem
|
||||
thermo
|
||||
thermo_modify
|
||||
thermo_style
|
||||
|
||||
@ -10,22 +10,34 @@ compute coord/atom command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
compute ID group-ID coord/atom cutoff type1 type2 ... :pre
|
||||
compute ID group-ID coord/atom cstyle args ... :pre
|
||||
|
||||
ID, group-ID are documented in "compute"_compute.html command
|
||||
coord/atom = style name of this compute command
|
||||
cutoff = distance within which to count coordination neighbors (distance units)
|
||||
typeN = atom type for Nth coordination count (see asterisk form below) :ul
|
||||
one cstyle must be appended :ul
|
||||
|
||||
cstyle = {cutoff} or {orientorder}
|
||||
|
||||
{cutoff} args = cutoff typeN
|
||||
cutoff = distance within which to count coordination neighbors (distance units)
|
||||
typeN = atom type for Nth coordination count (see asterisk form below) :pre
|
||||
|
||||
{orientorder} args = orientorderID threshold
|
||||
orientorderID = ID of a previously defined orientorder/atom compute
|
||||
threshold = minimum value of the scalar product between two 'connected' atoms (see text for explanation) :pre
|
||||
|
||||
[Examples:]
|
||||
|
||||
compute 1 all coord/atom 2.0
|
||||
compute 1 all coord/atom 6.0 1 2
|
||||
compute 1 all coord/atom 6.0 2*4 5*8 * :pre
|
||||
compute 1 all coord/atom cutoff 2.0
|
||||
compute 1 all coord/atom cutoff 6.0 1 2
|
||||
compute 1 all coord/atom cutoff 6.0 2*4 5*8 *
|
||||
compute 1 all coord/atom orientorder 2 0.5 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
Define a computation that calculates one or more coordination numbers
|
||||
This compute performs generic calculations between neighboring atoms. So far,
|
||||
there are two cstyles implemented: {cutoff} and {orientorder}.
|
||||
The {cutoff} cstyle calculates one or more coordination numbers
|
||||
for each atom in a group.
|
||||
|
||||
A coordination number is defined as the number of neighbor atoms with
|
||||
@ -49,6 +61,14 @@ from 1 to N. A leading asterisk means all types from 1 to n
|
||||
(inclusive). A middle asterisk means all types from m to n
|
||||
(inclusive).
|
||||
|
||||
The {orientorder} cstyle calculates the number of 'connected' atoms j
|
||||
around each atom i. The atom j is connected to i if the scalar product
|
||||
({Ybar_lm(i)},{Ybar_lm(j)}) is larger than {threshold}. Thus, this cstyle
|
||||
will work only if a "compute orientorder/atom"_compute_orientorder_atom.html
|
||||
has been previously defined. This cstyle allows one to apply the
|
||||
ten Wolde's criterion to identify crystal-like atoms in a system
|
||||
(see "ten Wolde et al."_#tenWolde).
|
||||
|
||||
The value of all coordination numbers will be 0.0 for atoms not in the
|
||||
specified compute group.
|
||||
|
||||
@ -83,10 +103,19 @@ options.
|
||||
The per-atom vector or array values will be a number >= 0.0, as
|
||||
explained above.
|
||||
|
||||
[Restrictions:] none
|
||||
[Restrictions:]
|
||||
The cstyle {orientorder} can only be used if a
|
||||
"compute orientorder/atom"_compute_orientorder_atom.html command
|
||||
was previously defined. Otherwise, an error message will be issued.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
"compute cluster/atom"_compute_cluster_atom.html
|
||||
"compute orientorder/atom"_compute_orientorder_atom.html
|
||||
|
||||
[Default:] none
|
||||
|
||||
:line
|
||||
|
||||
:link(tenWolde)
|
||||
[(tenWolde)] P. R. ten Wolde, M. J. Ruiz-Montero, D. Frenkel, J. Chem. Phys. 104, 9932 (1996).
|
||||
|
||||
@ -15,17 +15,19 @@ compute ID group-ID orientorder/atom keyword values ... :pre
|
||||
ID, group-ID are documented in "compute"_compute.html command :ulb,l
|
||||
orientorder/atom = style name of this compute command :l
|
||||
one or more keyword/value pairs may be appended :l
|
||||
keyword = {cutoff} or {nnn} or {degrees}
|
||||
keyword = {cutoff} or {nnn} or {degrees} or {components}
|
||||
{cutoff} value = distance cutoff
|
||||
{nnn} value = number of nearest neighbors
|
||||
{degrees} values = nlvalues, l1, l2,... :pre
|
||||
{degrees} values = nlvalues, l1, l2,...
|
||||
{components} value = l :pre
|
||||
|
||||
:ule
|
||||
|
||||
[Examples:]
|
||||
|
||||
compute 1 all orientorder/atom
|
||||
compute 1 all orientorder/atom degrees 5 4 6 8 10 12 nnn NULL cutoff 1.5 :pre
|
||||
compute 1 all orientorder/atom degrees 5 4 6 8 10 12 nnn NULL cutoff 1.5
|
||||
compute 1 all orientorder/atom degrees 4 6 components 6 nnn NULL cutoff 3.0 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
@ -71,6 +73,13 @@ The numerical values of all order parameters up to {Q}12
|
||||
for a range of commonly encountered high-symmetry structures are given
|
||||
in Table I of "Mickel et al."_#Mickel.
|
||||
|
||||
The optional keyword {components} will output the components of
|
||||
the normalized complex vector {Ybar_lm} of degree {l}, which must be
|
||||
explicitly included in the keyword {degrees}. This option can be used
|
||||
in conjunction with "compute coord/atom"_compute_coord_atom.html to
|
||||
calculate the ten Wolde's criterion to identify crystal-like particles
|
||||
(see "ten Wolde et al."_#tenWolde96).
|
||||
|
||||
The value of {Ql} is set to zero for atoms not in the
|
||||
specified compute group, as well as for atoms that have less than
|
||||
{nnn} neighbors within the distance cutoff.
|
||||
@ -98,6 +107,12 @@ the neighbor list.
|
||||
This compute calculates a per-atom array with {nlvalues} columns, giving the
|
||||
{Ql} values for each atom, which are real numbers on the range 0 <= {Ql} <= 1.
|
||||
|
||||
If the keyword {components} is set, then the real and imaginary parts of each
|
||||
component of (normalized) {Ybar_lm} will be added to the output array in the
|
||||
following order:
|
||||
Re({Ybar_-m}) Im({Ybar_-m}) Re({Ybar_-m+1}) Im({Ybar_-m+1}) ... Re({Ybar_m}) Im({Ybar_m}).
|
||||
This way, the per-atom array will have a total of {nlvalues}+2*(2{l}+1) columns.
|
||||
|
||||
These values can be accessed by any command that uses
|
||||
per-atom values from a compute as input. See "Section
|
||||
6.15"_Section_howto.html#howto_15 for an overview of LAMMPS output
|
||||
@ -117,5 +132,9 @@ The option defaults are {cutoff} = pair style cutoff, {nnn} = 12, {degrees} = 5
|
||||
|
||||
:link(Steinhardt)
|
||||
[(Steinhardt)] P. Steinhardt, D. Nelson, and M. Ronchetti, Phys. Rev. B 28, 784 (1983).
|
||||
|
||||
:link(Mickel)
|
||||
[(Mickel)] W. Mickel, S. C. Kapfer, G. E. Schroeder-Turk, and K. Mecke, J. Chem. Phys. 138, 044501 (2013).
|
||||
|
||||
:link(tenWolde96)
|
||||
[(tenWolde)] P. R. ten Wolde, M. J. Ruiz-Montero, D. Frenkel, J. Chem. Phys. 104, 9932 (1996).
|
||||
|
||||
@ -35,6 +35,7 @@ Computes :h1
|
||||
compute_erotate_sphere_atom
|
||||
compute_event_displace
|
||||
compute_fep
|
||||
compute_global_atom
|
||||
compute_group_group
|
||||
compute_gyration
|
||||
compute_gyration_chunk
|
||||
|
||||
@ -151,7 +151,7 @@ The option default for the {energy} keyword is energy = no.
|
||||
:line
|
||||
|
||||
:link(Strong)
|
||||
[(Strong)] Strong and Eaves, J. Phys. Chem. Lett. 7, 1907 (2016).
|
||||
[(Strong)] Strong and Eaves, J. Phys. Chem. B 121, 189 (2017).
|
||||
|
||||
:link(Evans)
|
||||
[(Evans)] Evans and Morriss, Phys. Rev. Lett. 56, 2172 (1986).
|
||||
|
||||
@ -29,7 +29,7 @@ fix fxgREM all grem 502 -0.15 -80000 fxnvt :pre
|
||||
[Description:]
|
||||
|
||||
This fix implements the molecular dynamics version of the generalized
|
||||
replica exchange method (gREM) originally developed by "(Kim)"_#Kim,
|
||||
replica exchange method (gREM) originally developed by "(Kim)"_#Kim2010,
|
||||
which uses non-Boltzmann ensembles to sample over first order phase
|
||||
transitions. This is done by defining replicas with an enthalpy
|
||||
dependent effective temperature
|
||||
@ -103,7 +103,7 @@ npt"_fix_nh.html, "thermo_modify"_thermo_modify.html
|
||||
|
||||
:line
|
||||
|
||||
:link(Kim)
|
||||
:link(Kim2010)
|
||||
[(Kim)] Kim, Keyes, Straub, J. Chem. Phys. 132, 224107 (2010).
|
||||
|
||||
:link(Malolepsza)
|
||||
|
||||
@ -89,11 +89,7 @@ NOTE: The center of mass of a group of atoms is calculated in
|
||||
group can straddle a periodic boundary. See the "dump"_dump.html doc
|
||||
page for a discussion of unwrapped coordinates. It also means that a
|
||||
spring connecting two groups or a group and the tether point can cross
|
||||
a periodic boundary and its length be calculated correctly. One
|
||||
exception is for rigid bodies, which should not be used with the fix
|
||||
spring command, if the rigid body will cross a periodic boundary.
|
||||
This is because image flags for rigid bodies are used in a different
|
||||
way, as explained on the "fix rigid"_fix_rigid.html doc page.
|
||||
a periodic boundary and its length be calculated correctly.
|
||||
|
||||
[Restart, fix_modify, output, run start/stop, minimize info:]
|
||||
|
||||
|
||||
@ -23,6 +23,7 @@ Section_history.html
|
||||
|
||||
tutorial_drude.html
|
||||
tutorial_github.html
|
||||
tutorial_pylammps.html
|
||||
|
||||
body.html
|
||||
manifolds.html
|
||||
@ -113,6 +114,7 @@ special_bonds.html
|
||||
suffix.html
|
||||
tad.html
|
||||
temper.html
|
||||
temper_grem.html
|
||||
thermo.html
|
||||
thermo_modify.html
|
||||
thermo_style.html
|
||||
|
||||
@ -32,7 +32,7 @@ Run a parallel tempering or replica exchange simulation in LAMMPS
|
||||
partition mode using multiple generalized replicas (ensembles) of a
|
||||
system defined by "fix grem"_fix_grem.html, which stands for the
|
||||
generalized replica exchange method (gREM) originally developed by
|
||||
"(Kim)"_#Kim. It uses non-Boltzmann ensembles to sample over first
|
||||
"(Kim)"_#KimStraub. It uses non-Boltzmann ensembles to sample over first
|
||||
order phase transitions. This is done by defining replicas with an
|
||||
enthalpy dependent effective temperature
|
||||
|
||||
@ -105,5 +105,5 @@ This command must be used with "fix grem"_fix_grem.html.
|
||||
|
||||
[Default:] none
|
||||
|
||||
:link(Kim)
|
||||
:link(KimStraub)
|
||||
[(Kim)] Kim, Keyes, Straub, J. Chem. Phys. 132, 224107 (2010).
|
||||
|
||||
@ -33,14 +33,14 @@ timer loop :pre
|
||||
Select the level of detail at which LAMMPS performs its CPU timings.
|
||||
Multiple keywords can be specified with the {timer} command. For
|
||||
keywords that are mutually exclusive, the last one specified takes
|
||||
effect.
|
||||
precedence.
|
||||
|
||||
During a simulation run LAMMPS collects information about how much
|
||||
time is spent in different sections of the code and thus can provide
|
||||
information for determining performance and load imbalance problems.
|
||||
This can be done at different levels of detail and accuracy. For more
|
||||
information about the timing output, see this "discussion of screen
|
||||
output"_Section_start.html#start_8.
|
||||
output in Section 2.8"_Section_start.html#start_8.
|
||||
|
||||
The {off} setting will turn all time measurements off. The {loop}
|
||||
setting will only measure the total time for a run and not collect any
|
||||
@ -52,20 +52,22 @@ procsessors. The {full} setting adds information about CPU
|
||||
utilization and thread utilization, when multi-threading is enabled.
|
||||
|
||||
With the {sync} setting, all MPI tasks are synchronized at each timer
|
||||
call which meaures load imbalance more accuractly, though it can also
|
||||
slow down the simulation. Using the {nosync} setting (which is the
|
||||
default) turns off this synchronization.
|
||||
call which measures load imbalance for each section more accurately,
|
||||
though it can also slow down the simulation by prohibiting overlapping
|
||||
independent computations on different MPI ranks. Using the {nosync}
|
||||
setting (which is the default) turns this synchronization off.
|
||||
|
||||
With the {timeout} keyword a walltime limit can be imposed that
|
||||
With the {timeout} keyword a walltime limit can be imposed that
|
||||
affects the "run"_run.html and "minimize"_minimize.html commands.
|
||||
This can be convenient when runs have to confirm to time limits,
|
||||
e.g. when running under a batch system and you want to maximize
|
||||
the utilization of the batch time slot, especially when the time
|
||||
per timestep varies and is thus difficult to predict how many
|
||||
steps a simulation can perform, or for difficult to converge
|
||||
minimizations. The timeout {elapse} value should be somewhat smaller
|
||||
than the time requested from the batch system, as there is usually
|
||||
some overhead to launch jobs, and it may be advisable to write
|
||||
This can be convenient when calculations have to comply with execution
|
||||
time limits, e.g. when running under a batch system when you want to
|
||||
maximize the utilization of the batch time slot, especially for runs
|
||||
where the time per timestep varies much and thus it becomes difficult
|
||||
to predict how many steps a simulation can perform for a given walltime
|
||||
limit. This also applies for difficult to converge minimizations.
|
||||
The timeout {elapse} value should be somewhat smaller than the maximum
|
||||
wall time requested from the batch system, as there is usually
|
||||
some overhead to launch jobs, and it is advisable to write
|
||||
out a restart after terminating a run due to a timeout.
|
||||
|
||||
The timeout timer starts when the command is issued. When the time
|
||||
|
||||
@ -336,12 +336,15 @@ commit and push again:
|
||||
$ git commit -m "Merged Axel's suggestions and updated text"
|
||||
$ git push git@github.com:Pakketeretet2/lammps :pre
|
||||
|
||||
This merge also shows up on the lammps Github page:
|
||||
|
||||
:c,image(JPG/tutorial_reverse_pull_request7.png)
|
||||
|
||||
:line
|
||||
|
||||
[After a merge]
|
||||
|
||||
When everything is fine, the feature branch is merged into the master branch.
|
||||
When everything is fine, the feature branch is merged into the master branch:
|
||||
|
||||
:c,image(JPG/tutorial_merged.png)
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ neigh_modify every 1 delay 0 check no once no
|
||||
timestep 0.001
|
||||
|
||||
compute dpdU all dpd
|
||||
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[1]+press*vol
|
||||
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[2]+press*vol
|
||||
|
||||
thermo 1
|
||||
thermo_style custom step temp press vol pe ke v_totEnergy cella cellb cellc
|
||||
|
||||
@ -22,7 +22,7 @@ neigh_modify every 1 delay 0 check no once no
|
||||
timestep 0.001
|
||||
|
||||
compute dpdU all dpd
|
||||
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[1]+press*vol
|
||||
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[2]+press*vol
|
||||
|
||||
thermo 1
|
||||
thermo_style custom step temp press vol pe ke v_totEnergy cella cellb cellc
|
||||
@ -34,129 +34,137 @@ fix 2 all eos/cv 0.0005
|
||||
|
||||
run 100
|
||||
Neighbor list info ...
|
||||
1 neighbor list requests
|
||||
update every 1 steps, delay 0 steps, check no
|
||||
max neighbors/atom: 2000, page size: 100000
|
||||
master list distance cutoff = 12
|
||||
ghost atom cutoff = 12
|
||||
binsize = 6 -> bins = 22 22 22
|
||||
Memory usage per processor = 6.48143 Mbytes
|
||||
binsize = 6, bins = 22 22 22
|
||||
2 neighbor lists, perpetual/occasional/extra = 2 0 0
|
||||
(1) pair dpd/fdt/energy, perpetual
|
||||
pair build: half/bin/newton
|
||||
stencil: half/bin/3d/newton
|
||||
bin: standard
|
||||
(2) fix shardlow, perpetual, ssa
|
||||
pair build: half/bin/newton/ssa
|
||||
stencil: half/bin/3d/newton/ssa
|
||||
bin: ssa
|
||||
Memory usage per processor = 8.55503 Mbytes
|
||||
Step Temp Press Volume PotEng KinEng v_totEnergy Cella Cellb Cellc
|
||||
0 239.4274282976 2817.4421750949 2146689.0000000000 2639.8225470740 313.3218455755 6048176597.3066043854 129.0000000000 129.0000000000 129.0000000000
|
||||
1 239.4771405316 2817.4798146419 2146689.0000581890 2639.8304543632 313.3869004818 6048257397.9450111389 129.0000000012 129.0000000012 129.0000000012
|
||||
2 239.5643955010 2817.5423194969 2146689.0002327557 2639.8379071907 313.5010849268 6048391577.0431985855 129.0000000047 129.0000000047 129.0000000047
|
||||
3 239.6633839196 2817.6123662396 2146689.0005237064 2639.8445238058 313.6306241122 6048541946.5712032318 129.0000000105 129.0000000105 129.0000000105
|
||||
4 239.5371222027 2817.5355424336 2146689.0009310376 2639.8505035043 313.4653942786 6048377030.7404460907 129.0000000186 129.0000000186 129.0000000186
|
||||
5 239.6512678169 2817.6153097076 2146689.0014547524 2639.8561498340 313.6147686202 6048548267.9007377625 129.0000000291 129.0000000291 129.0000000291
|
||||
6 239.5617886781 2817.5624195435 2146689.0020948485 2639.8617493725 313.4976735610 6048434730.8592004776 129.0000000420 129.0000000420 129.0000000420
|
||||
7 239.5228587856 2817.5420009502 2146689.0028513218 2639.8666590407 313.4467287471 6048390900.5748577118 129.0000000571 129.0000000571 129.0000000571
|
||||
8 239.6066877934 2817.6008649264 2146689.0037241788 2639.8710757645 313.5564298772 6048517265.7987136841 129.0000000746 129.0000000746 129.0000000746
|
||||
9 239.5719861485 2817.5823530300 2146689.0047134170 2639.8752557893 313.5110182737 6048477529.2603597641 129.0000000944 129.0000000944 129.0000000944
|
||||
10 239.5800176776 2817.5915671176 2146689.0058190385 2639.8793778438 313.5215285712 6048497312.1706552505 129.0000001166 129.0000001166 129.0000001166
|
||||
11 239.6299830954 2817.6281223139 2146689.0070410441 2639.8829762049 313.5869148014 6048575788.3208351135 129.0000001410 129.0000001410 129.0000001410
|
||||
12 239.6011995911 2817.6132377273 2146689.0083794324 2639.8860704236 313.5492478526 6048543839.4788360596 129.0000001678 129.0000001678 129.0000001678
|
||||
13 239.6407681166 2817.6427924824 2146689.0098342048 2639.8889816934 313.6010284005 6048607288.5005025864 129.0000001970 129.0000001970 129.0000001970
|
||||
14 239.6981172055 2817.6844100046 2146689.0114053637 2639.8913405110 313.6760771219 6048696632.8825626373 129.0000002285 129.0000002285 129.0000002285
|
||||
15 239.8563971968 2817.7922519039 2146689.0130929090 2639.8934358481 313.8832070208 6048928140.8671455383 129.0000002623 129.0000002623 129.0000002623
|
||||
16 239.8561894618 2817.7971208197 2146689.0148968464 2639.8950496967 313.8829351726 6048938597.9994916916 129.0000002984 129.0000002984 129.0000002984
|
||||
17 239.8816520361 2817.8185621543 2146689.0168171758 2639.8961257823 313.9162562538 6048984631.3226108551 129.0000003369 129.0000003369 129.0000003369
|
||||
18 239.9099966096 2817.8417368960 2146689.0188538977 2639.8965743204 313.9533488047 6049034386.0627622604 129.0000003777 129.0000003777 129.0000003777
|
||||
19 240.0514024347 2817.9389205774 2146689.0210070144 2639.8966103811 314.1383966683 6049243015.4568052292 129.0000004208 129.0000004208 129.0000004208
|
||||
20 239.8802541140 2817.8327386176 2146689.0232765260 2639.8962085210 313.9144268914 6049015081.9802341461 129.0000004662 129.0000004662 129.0000004662
|
||||
21 239.8462621903 2817.8160306167 2146689.0256624296 2639.8953174755 313.8699440502 6048979221.7758703232 129.0000005140 129.0000005140 129.0000005140
|
||||
22 240.0487944678 2817.9533849157 2146689.0281647225 2639.8938590354 314.1349838054 6049274086.0571212769 129.0000005642 129.0000005642 129.0000005642
|
||||
23 240.0966314441 2817.9897873787 2146689.0307834130 2639.8918104774 314.1975846937 6049352238.2649183273 129.0000006166 129.0000006166 129.0000006166
|
||||
24 240.1765312516 2818.0463843765 2146689.0335185044 2639.8891292321 314.3021439554 6049473742.2287187576 129.0000006714 129.0000006714 129.0000006714
|
||||
25 240.1500705973 2818.0336048048 2146689.0363699966 2639.8858785483 314.2675167572 6049446316.4600162506 129.0000007285 129.0000007285 129.0000007285
|
||||
26 240.2681423500 2818.1151708195 2146689.0393378921 2639.8825176506 314.4220289603 6049621421.8445177078 129.0000007880 129.0000007880 129.0000007880
|
||||
27 240.4728815247 2818.2527327079 2146689.0424221945 2639.8784158747 314.6899567267 6049916733.3989181519 129.0000008498 129.0000008498 129.0000008498
|
||||
28 240.4793027032 2818.2613348477 2146689.0456229053 2639.8736089473 314.6983596717 6049935208.5421981812 129.0000009139 129.0000009139 129.0000009139
|
||||
29 240.5020619198 2818.2805472685 2146689.0489400285 2639.8681043704 314.7281430587 6049976461.0082206726 129.0000009803 129.0000009803 129.0000009803
|
||||
30 240.5513721776 2818.3167157263 2146689.0523735629 2639.8623484053 314.7926719270 6050054113.1760177612 129.0000010491 129.0000010491 129.0000010491
|
||||
31 240.7340393104 2818.4391703712 2146689.0559235099 2639.8563442170 315.0317155636 6050316995.4599781036 129.0000011202 129.0000011202 129.0000011202
|
||||
32 240.8254719483 2818.5014640740 2146689.0595898777 2639.8498122053 315.1513670299 6050450731.1168394089 129.0000011936 129.0000011936 129.0000011936
|
||||
33 240.9681573541 2818.5965480750 2146689.0633726656 2639.8425779528 315.3380893908 6050654857.7432861328 129.0000012694 129.0000012694 129.0000012694
|
||||
34 241.0039494187 2818.6217008564 2146689.0672718794 2639.8347174393 315.3849279499 6050708863.9733209610 129.0000013475 129.0000013475 129.0000013475
|
||||
35 241.0314566197 2818.6411150538 2146689.0712875174 2639.8262983643 315.4209246902 6050750551.5649127960 129.0000014279 129.0000014279 129.0000014279
|
||||
36 241.0829173424 2818.6763455617 2146689.0754195810 2639.8174397481 315.4882677207 6050826192.2165899277 129.0000015107 129.0000015107 129.0000015107
|
||||
37 241.2845682012 2818.8087982181 2146689.0796680767 2639.8080129872 315.7521540252 6051110539.1171846390 129.0000015958 129.0000015958 129.0000015958
|
||||
38 241.3214712920 2818.8336260248 2146689.0840330068 2639.7981963574 315.8004465062 6051163849.0412235260 129.0000016833 129.0000016833 129.0000016833
|
||||
39 241.3392127125 2818.8456991528 2146689.0885143690 2639.7879618658 315.8236634561 6051189778.9386901855 129.0000017730 129.0000017730 129.0000017730
|
||||
40 241.5383770555 2818.9753950055 2146689.0931121684 2639.7769824244 316.0842958321 6051468208.8210506439 129.0000018651 129.0000018651 129.0000018651
|
||||
41 241.5059730674 2818.9543817992 2146689.0978264087 2639.7656512498 316.0418910106 6051423113.2358427048 129.0000019595 129.0000019595 129.0000019595
|
||||
42 241.3907605672 2818.8793800508 2146689.1026570834 2639.7541331920 315.8911205101 6051262121.2551422119 129.0000020563 129.0000020563 129.0000020563
|
||||
43 241.5095917610 2818.9559595711 2146689.1076041958 2639.7424355740 316.0466265406 6051426527.7663059235 129.0000021554 129.0000021554 129.0000021554
|
||||
44 241.6271631762 2819.0312325531 2146689.1126677482 2639.7297705654 316.2004839873 6051588129.8722610474 129.0000022568 129.0000022568 129.0000022568
|
||||
45 241.5702411838 2818.9923790176 2146689.1178477411 2639.7163554760 316.1259941770 6051504737.9250564575 129.0000023606 129.0000023606 129.0000023606
|
||||
46 241.7029985068 2819.0771124986 2146689.1231441777 2639.7024246704 316.2997243538 6051686649.4576120377 129.0000024667 129.0000024667 129.0000024667
|
||||
47 241.7966144965 2819.1357830868 2146689.1285570571 2639.6882106593 316.4222330191 6051812612.3391046524 129.0000025751 129.0000025751 129.0000025751
|
||||
48 241.8573480255 2819.1726205120 2146689.1340863821 2639.6735287925 316.5017107195 6051891706.4921989441 129.0000026859 129.0000026859 129.0000026859
|
||||
49 241.9611147338 2819.2374095379 2146689.1397321564 2639.6583357477 316.6375029166 6052030804.4275226593 129.0000027990 129.0000027990 129.0000027990
|
||||
50 242.1023518806 2819.3259059811 2146689.1454943856 2639.6424863169 316.8223300428 6052220795.1955394745 129.0000029144 129.0000029144 129.0000029144
|
||||
51 242.1174105473 2819.3319633044 2146689.1513730693 2639.6264141131 316.8420362613 6052233814.9634265900 129.0000030321 129.0000030321 129.0000030321
|
||||
52 242.2534914901 2819.4164594322 2146689.1573682069 2639.6098392670 317.0201158259 6052415218.9485445023 129.0000031522 129.0000031522 129.0000031522
|
||||
53 242.3504633236 2819.4754119996 2146689.1634798055 2639.5930076506 317.1470160479 6052541789.1274013519 129.0000032746 129.0000032746 129.0000032746
|
||||
54 242.2982323323 2819.4368568264 2146689.1697078613 2639.5756353782 317.0786650211 6052459040.6286897659 129.0000033994 129.0000033994 129.0000033994
|
||||
55 242.3452896272 2819.4623310219 2146689.1760523771 2639.5575918586 317.1402455951 6052513743.7400159836 129.0000035265 129.0000035265 129.0000035265
|
||||
56 242.4181903333 2819.5048897011 2146689.1825133534 2639.5390347547 317.2356456249 6052605122.2894439697 129.0000036559 129.0000036559 129.0000036559
|
||||
57 242.5317091656 2819.5739975787 2146689.1890907930 2639.5199828249 317.3841997413 6052753494.0979280472 129.0000037876 129.0000037876 129.0000037876
|
||||
58 242.5478978740 2819.5796954935 2146689.1957846982 2639.5006137388 317.4053847660 6052765744.6257629395 129.0000039217 129.0000039217 129.0000039217
|
||||
59 242.6655316466 2819.6519225743 2146689.2025950695 2639.4808234811 317.5593238156 6052920813.0568208694 129.0000040582 129.0000040582 129.0000040582
|
||||
60 242.8126131177 2819.7431588157 2146689.2095219092 2639.4607996998 317.7517989980 6053116688.6155729294 129.0000041969 129.0000041969 129.0000041969
|
||||
61 242.7957124913 2819.7275989047 2146689.2165652174 2639.4406312730 317.7296823362 6053083306.1403274536 129.0000043380 129.0000043380 129.0000043380
|
||||
62 242.9276177041 2819.8088790098 2146689.2237249981 2639.4201279058 317.9022974164 6053257809.6067762375 129.0000044814 129.0000044814 129.0000044814
|
||||
63 243.0465445938 2819.8814758895 2146689.2310012528 2639.3991657500 318.0579286774 6053413673.1989650726 129.0000046272 129.0000046272 129.0000046272
|
||||
64 242.9890585501 2819.8387587817 2146689.2383939880 2639.3781767844 317.9827007328 6053321993.5937871933 129.0000047752 129.0000047752 129.0000047752
|
||||
65 242.9653746583 2819.8180104181 2146689.2459031967 2639.3568184374 317.9517072884 6053277474.4272727966 129.0000049256 129.0000049256 129.0000049256
|
||||
66 243.0259297024 2819.8514334947 2146689.2535288804 2639.3352568621 318.0309514181 6053349244.9473772049 129.0000050784 129.0000050784 129.0000050784
|
||||
67 242.9638979697 2819.8046112742 2146689.2612710390 2639.3134547096 317.9497748498 6053248753.9180717468 129.0000052335 129.0000052335 129.0000052335
|
||||
68 243.0283540775 2819.8395632725 2146689.2691296688 2639.2912303374 318.0341240273 6053323807.2197017670 129.0000053909 129.0000053909 129.0000053909
|
||||
69 243.2256418664 2819.9609646019 2146689.2771047787 2639.2684509205 318.2923006889 6053584440.8757400513 129.0000055506 129.0000055506 129.0000055506
|
||||
70 243.2507495334 2819.9706145524 2146689.2851963686 2639.2450126010 318.3251573278 6053605179.1483964920 129.0000057127 129.0000057127 129.0000057127
|
||||
71 243.4287155518 2820.0794853386 2146689.2934044413 2639.2213699915 318.5580489464 6053838914.2552747726 129.0000058771 129.0000058771 129.0000058771
|
||||
72 243.5097518574 2820.1249498194 2146689.3017290002 2639.1971212009 318.6640954635 6053936535.9274711609 129.0000060439 129.0000060439 129.0000060439
|
||||
73 243.5356790969 2820.1337977544 2146689.3101700447 2639.1723394661 318.6980246193 6053955553.5090074539 129.0000062130 129.0000062130 129.0000062130
|
||||
74 243.5479180498 2820.1331964183 2146689.3187275808 2639.1473868749 318.7140408766 6053954286.7515821457 129.0000063844 129.0000063844 129.0000063844
|
||||
75 243.7115573025 2820.2314361523 2146689.3274016059 2639.1220411207 318.9281840641 6054165201.5909118652 129.0000065581 129.0000065581 129.0000065581
|
||||
76 243.7457279618 2820.2454531429 2146689.3361921217 2639.0963868224 318.9729008040 6054195316.5254154205 129.0000067342 129.0000067342 129.0000067342
|
||||
77 243.8345031069 2820.2948644965 2146689.3450991292 2639.0700900389 319.0890745962 6054301412.5615310669 129.0000069126 129.0000069126 129.0000069126
|
||||
78 244.0193931195 2820.4067881628 2146689.3541226317 2639.0435094409 319.3310271594 6054541703.5689058304 129.0000070934 129.0000070934 129.0000070934
|
||||
79 243.9919100078 2820.3799166166 2146689.3632626338 2639.0164249037 319.2950619430 6054484044.4218587875 129.0000072765 129.0000072765 129.0000072765
|
||||
80 244.0965612207 2820.4387335935 2146689.3725191355 2638.9888176882 319.4320116291 6054610332.4174261093 129.0000074619 129.0000074619 129.0000074619
|
||||
81 244.1334315951 2820.4535208568 2146689.3818921377 2638.9608330195 319.4802612965 6054642102.5347270966 129.0000076496 129.0000076496 129.0000076496
|
||||
82 244.3029520408 2820.5543485196 2146689.3913816395 2638.9318525796 319.7021007878 6054858575.1664342880 129.0000078397 129.0000078397 129.0000078397
|
||||
83 244.3445761189 2820.5713690935 2146689.4009876498 2638.9021684795 319.7565712929 6054895140.1710596085 129.0000080321 129.0000080321 129.0000080321
|
||||
84 244.2696671559 2820.5125763350 2146689.4107101629 2638.8720941742 319.6585431986 6054768957.6739044189 129.0000082269 129.0000082269 129.0000082269
|
||||
85 244.5161919319 2820.6629431352 2146689.4205491822 2638.8415194387 319.9811528443 6055091776.5361995697 129.0000084240 129.0000084240 129.0000084240
|
||||
86 244.5641090282 2820.6838080201 2146689.4305047127 2638.8103612394 320.0438585800 6055136595.0767974854 129.0000086234 129.0000086234 129.0000086234
|
||||
87 244.5348240638 2820.6541129118 2146689.4405767513 2638.7789728309 320.0055354056 6055072877.2416200638 129.0000088251 129.0000088251 129.0000088251
|
||||
88 244.6939431427 2820.7468233396 2146689.4507653015 2638.7470269267 320.2137633592 6055271926.6536149979 129.0000090292 129.0000090292 129.0000090292
|
||||
89 244.8800201091 2820.8567117003 2146689.4610703662 2638.7147520097 320.4572692055 6055507852.1186332703 129.0000092356 129.0000092356 129.0000092356
|
||||
90 244.8804280382 2820.8451141876 2146689.4714919478 2638.6820441173 320.4578030336 6055482985.2258749008 129.0000094444 129.0000094444 129.0000094444
|
||||
91 244.9558851986 2820.8815975090 2146689.4820300462 2638.6491836104 320.5565485155 6055561333.3803453445 129.0000096555 129.0000096555 129.0000096555
|
||||
92 244.9965893140 2820.8949614294 2146689.4926846647 2638.6159817170 320.6098151301 6055590051.6433181763 129.0000098689 129.0000098689 129.0000098689
|
||||
93 245.1381056687 2820.9732811388 2146689.5034558061 2638.5824451870 320.7950076360 6055758210.2774200439 129.0000100846 129.0000100846 129.0000100846
|
||||
94 245.2954807041 2821.0619342131 2146689.5143434699 2638.5485198222 321.0009532826 6055948551.7882709503 129.0000103027 129.0000103027 129.0000103027
|
||||
95 245.3535822199 2821.0860553731 2146689.5253476589 2638.5144817512 321.0769866522 6056000363.5151576996 129.0000105232 129.0000105232 129.0000105232
|
||||
96 245.5013476026 2821.1682908185 2146689.5364683764 2638.4801107361 321.2703568219 6056176929.0169925690 129.0000107459 129.0000107459 129.0000107459
|
||||
97 245.4166531417 2821.0989038023 2146689.5477056229 2638.4453663061 321.1595231342 6056028008.1910057068 129.0000109710 129.0000109710 129.0000109710
|
||||
98 245.4121937790 2821.0817490953 2146689.5590593945 2638.4097762390 321.1536874797 6055991214.3494396210 129.0000111984 129.0000111984 129.0000111984
|
||||
99 245.4532592994 2821.0946353191 2146689.5705296928 2638.3738037546 321.2074270397 6056018909.4480972290 129.0000114282 129.0000114282 129.0000114282
|
||||
100 245.7500657390 2821.2735939427 2146689.5821165247 2638.3375549051 321.5958367642 6056403111.1006488800 129.0000116603 129.0000116603 129.0000116603
|
||||
Loop time of 4.05006 on 1 procs for 100 steps with 10125 atoms
|
||||
0 239.4274282976 2817.4421750949 2146689.0000000000 2639.8225470740 313.3218455755 6048176597.3066034317 129.0000000000 129.0000000000 129.0000000000
|
||||
1 239.4771405316 2817.4798146419 2146689.0000581890 2639.8304543632 313.3869004818 6048257397.8720483780 129.0000000012 129.0000000012 129.0000000012
|
||||
2 239.5643955010 2817.5423194969 2146689.0002327557 2639.8379071907 313.5010849268 6048391576.8485937119 129.0000000047 129.0000000047 129.0000000047
|
||||
3 239.6633839196 2817.6123662396 2146689.0005237064 2639.8445238058 313.6306241122 6048541946.2404479980 129.0000000105 129.0000000105 129.0000000105
|
||||
4 239.5371222027 2817.5355424336 2146689.0009310376 2639.8505035043 313.4653942786 6048377030.5689325333 129.0000000186 129.0000000186 129.0000000186
|
||||
5 239.6512678169 2817.6153097076 2146689.0014547524 2639.8561498340 313.6147686202 6048548267.5742130280 129.0000000291 129.0000000291 129.0000000291
|
||||
6 239.5617886781 2817.5624195435 2146689.0020948485 2639.8617493725 313.4976735610 6048434730.6441593170 129.0000000420 129.0000000420 129.0000000420
|
||||
7 239.5228587856 2817.5420009502 2146689.0028513218 2639.8666590407 313.4467287471 6048390900.4058599472 129.0000000571 129.0000000571 129.0000000571
|
||||
8 239.6066877934 2817.6008649264 2146689.0037241788 2639.8710757645 313.5564298772 6048517265.5155982971 129.0000000746 129.0000000746 129.0000000746
|
||||
9 239.5719861485 2817.5823530300 2146689.0047134170 2639.8752557893 313.5110182737 6048477529.0184717178 129.0000000944 129.0000000944 129.0000000944
|
||||
10 239.5800176776 2817.5915671176 2146689.0058190385 2639.8793778438 313.5215285712 6048497311.9141387939 129.0000001166 129.0000001166 129.0000001166
|
||||
11 239.6299830954 2817.6281223139 2146689.0070410441 2639.8829762049 313.5869148014 6048575787.9953098297 129.0000001410 129.0000001410 129.0000001410
|
||||
12 239.6011995911 2817.6132377273 2146689.0083794324 2639.8860704236 313.5492478526 6048543839.1878814697 129.0000001678 129.0000001678 129.0000001678
|
||||
13 239.6407681166 2817.6427924824 2146689.0098342048 2639.8889816934 313.6010284005 6048607288.1548709869 129.0000001970 129.0000001970 129.0000001970
|
||||
14 239.6981172055 2817.6844100046 2146689.0114053637 2639.8913405110 313.6760771219 6048696632.4595127106 129.0000002285 129.0000002285 129.0000002285
|
||||
15 239.8563971968 2817.7922519039 2146689.0130929090 2639.8934358481 313.8832070208 6048928140.2348766327 129.0000002623 129.0000002623 129.0000002623
|
||||
16 239.8561894618 2817.7971208196 2146689.0148968464 2639.8950496967 313.8829351726 6048938597.3658657074 129.0000002984 129.0000002984 129.0000002984
|
||||
17 239.8816520361 2817.8185621543 2146689.0168171758 2639.8961257823 313.9162562538 6048984630.6545839310 129.0000003369 129.0000003369 129.0000003369
|
||||
18 239.9099966096 2817.8417368960 2146689.0188538977 2639.8965743204 313.9533488047 6049034385.3571958542 129.0000003777 129.0000003777 129.0000003777
|
||||
19 240.0514024347 2817.9389205774 2146689.0210070144 2639.8966103811 314.1383966683 6049243014.5661621094 129.0000004208 129.0000004208 129.0000004208
|
||||
20 239.8802541140 2817.8327386176 2146689.0232765260 2639.8962085210 313.9144268914 6049015081.3139505386 129.0000004662 129.0000004662 129.0000004662
|
||||
21 239.8462621903 2817.8160306167 2146689.0256624296 2639.8953174755 313.8699440502 6048979221.1549577713 129.0000005140 129.0000005140 129.0000005140
|
||||
22 240.0487944678 2817.9533849157 2146689.0281647225 2639.8938590354 314.1349838054 6049274085.1726217270 129.0000005642 129.0000005642 129.0000005642
|
||||
23 240.0966314441 2817.9897873787 2146689.0307834130 2639.8918104774 314.1975846937 6049352237.3198652267 129.0000006166 129.0000006166 129.0000006166
|
||||
24 240.1765312516 2818.0463843765 2146689.0335185044 2639.8891292321 314.3021439554 6049473741.1817827225 129.0000006714 129.0000006714 129.0000006714
|
||||
25 240.1500705973 2818.0336048048 2146689.0363699966 2639.8858785483 314.2675167572 6049446315.4509468079 129.0000007285 129.0000007285 129.0000007285
|
||||
26 240.2681423500 2818.1151708195 2146689.0393378921 2639.8825176506 314.4220289603 6049621420.6842966080 129.0000007880 129.0000007880 129.0000007880
|
||||
27 240.4728815247 2818.2527327079 2146689.0424221945 2639.8784158747 314.6899567267 6049916731.9748563766 129.0000008498 129.0000008498 129.0000008498
|
||||
28 240.4793027032 2818.2613348477 2146689.0456229053 2639.8736089473 314.6983596717 6049935207.1145420074 129.0000009139 129.0000009139 129.0000009139
|
||||
29 240.5020619198 2818.2805472685 2146689.0489400285 2639.8681043704 314.7281430587 6049976459.5562763214 129.0000009803 129.0000009803 129.0000009803
|
||||
30 240.5513721776 2818.3167157263 2146689.0523735629 2639.8623484053 314.7926719270 6050054111.6652946472 129.0000010491 129.0000010491 129.0000010491
|
||||
31 240.7340393104 2818.4391703712 2146689.0559235099 2639.8563442170 315.0317155636 6050316993.7162160873 129.0000011202 129.0000011202 129.0000011202
|
||||
32 240.8254719483 2818.5014640740 2146689.0595898777 2639.8498122053 315.1513670299 6050450729.2599506378 129.0000011936 129.0000011936 129.0000011936
|
||||
33 240.9681573541 2818.5965480750 2146689.0633726656 2639.8425779528 315.3380893908 6050654855.7068986893 129.0000012694 129.0000012694 129.0000012694
|
||||
34 241.0039494187 2818.6217008564 2146689.0672718794 2639.8347174393 315.3849279499 6050708861.8979463577 129.0000013475 129.0000013475 129.0000013475
|
||||
35 241.0314566197 2818.6411150538 2146689.0712875174 2639.8262983643 315.4209246902 6050750549.4619541168 129.0000014279 129.0000014279 129.0000014279
|
||||
36 241.0829173424 2818.6763455617 2146689.0754195810 2639.8174397481 315.4882677207 6050826190.0551443100 129.0000015107 129.0000015107 129.0000015107
|
||||
37 241.2845682012 2818.8087982181 2146689.0796680767 2639.8080129872 315.7521540252 6051110536.7012710571 129.0000015958 129.0000015958 129.0000015958
|
||||
38 241.3214712920 2818.8336260248 2146689.0840330068 2639.7981963574 315.8004465062 6051163846.5868301392 129.0000016833 129.0000016833 129.0000016833
|
||||
39 241.3392127125 2818.8456991528 2146689.0885143690 2639.7879618658 315.8236634561 6051189776.4712991714 129.0000017730 129.0000017730 129.0000017730
|
||||
40 241.5383770555 2818.9753950055 2146689.0931121684 2639.7769824244 316.0842958321 6051468206.1039972305 129.0000018651 129.0000018651 129.0000018651
|
||||
41 241.5059730674 2818.9543817992 2146689.0978264087 2639.7656512498 316.0418910106 6051423110.5725250244 129.0000019595 129.0000019595 129.0000019595
|
||||
42 241.3907605672 2818.8793800508 2146689.1026570834 2639.7541331920 315.8911205101 6051262118.7541017532 129.0000020563 129.0000020563 129.0000020563
|
||||
43 241.5095917610 2818.9559595711 2146689.1076041958 2639.7424355740 316.0466265406 6051426525.1214485168 129.0000021554 129.0000021554 129.0000021554
|
||||
44 241.6271631762 2819.0312325531 2146689.1126677482 2639.7297705654 316.2004839873 6051588127.0861988068 129.0000022568 129.0000022568 129.0000022568
|
||||
45 241.5702411838 2818.9923790176 2146689.1178477411 2639.7163554760 316.1259941770 6051504735.2269029617 129.0000023606 129.0000023606 129.0000023606
|
||||
46 241.7029985068 2819.0771124986 2146689.1231441777 2639.7024246704 316.2997243538 6051686646.5996389389 129.0000024667 129.0000024667 129.0000024667
|
||||
47 241.7966144965 2819.1357830868 2146689.1285570571 2639.6882106593 316.4222330191 6051812609.3728218079 129.0000025751 129.0000025751 129.0000025751
|
||||
48 241.8573480255 2819.1726205120 2146689.1340863821 2639.6735287925 316.5017107195 6051891703.4611186981 129.0000026859 129.0000026859 129.0000026859
|
||||
49 241.9611147338 2819.2374095379 2146689.1397321564 2639.6583357477 316.6375029166 6052030801.2758235931 129.0000027990 129.0000027990 129.0000027990
|
||||
50 242.1023518806 2819.3259059811 2146689.1454943856 2639.6424863169 316.8223300428 6052220791.8748512268 129.0000029144 129.0000029144 129.0000029144
|
||||
51 242.1174105473 2819.3319633044 2146689.1513730693 2639.6264141131 316.8420362613 6052233811.6391019821 129.0000030321 129.0000030321 129.0000030321
|
||||
52 242.2534914901 2819.4164594322 2146689.1573682069 2639.6098392671 317.0201158259 6052415215.4627037048 129.0000031522 129.0000031522 129.0000031522
|
||||
53 242.3504633236 2819.4754119996 2146689.1634798055 2639.5930076506 317.1470160479 6052541785.5314817429 129.0000032746 129.0000032746 129.0000032746
|
||||
54 242.2982323323 2819.4368568264 2146689.1697078613 2639.5756353782 317.0786650211 6052459037.1184797287 129.0000033994 129.0000033994 129.0000033994
|
||||
55 242.3452896272 2819.4623310219 2146689.1760523771 2639.5575918586 317.1402455951 6052513740.1862611771 129.0000035265 129.0000035265 129.0000035265
|
||||
56 242.4181903333 2819.5048897011 2146689.1825133534 2639.5390347547 317.2356456249 6052605118.6588287354 129.0000036559 129.0000036559 129.0000036559
|
||||
57 242.5317091656 2819.5739975787 2146689.1890907930 2639.5199828249 317.3841997413 6052753490.3378009796 129.0000037876 129.0000037876 129.0000037876
|
||||
58 242.5478978740 2819.5796954935 2146689.1957846982 2639.5006137388 317.4053847660 6052765740.8638200760 129.0000039217 129.0000039217 129.0000039217
|
||||
59 242.6655316466 2819.6519225743 2146689.2025950695 2639.4808234811 317.5593238156 6052920809.1607065201 129.0000040582 129.0000040582 129.0000040582
|
||||
60 242.8126131177 2819.7431588157 2146689.2095219092 2639.4607996998 317.7517989980 6053116684.5470046997 129.0000041969 129.0000041969 129.0000041969
|
||||
61 242.7957124913 2819.7275989047 2146689.2165652174 2639.4406312730 317.7296823362 6053083302.1140241623 129.0000043380 129.0000043380 129.0000043380
|
||||
62 242.9276177041 2819.8088790098 2146689.2237249981 2639.4201279058 317.9022974164 6053257805.4283437729 129.0000044814 129.0000044814 129.0000044814
|
||||
63 243.0465445938 2819.8814758895 2146689.2310012528 2639.3991657500 318.0579286774 6053413668.8858547211 129.0000046272 129.0000046272 129.0000046272
|
||||
64 242.9890585501 2819.8387587817 2146689.2383939880 2639.3781767844 317.9827007328 6053321989.3768787384 129.0000047752 129.0000047752 129.0000047752
|
||||
65 242.9653746583 2819.8180104181 2146689.2459031967 2639.3568184374 317.9517072884 6053277470.2627182007 129.0000049256 129.0000049256 129.0000049256
|
||||
66 243.0259297024 2819.8514334947 2146689.2535288804 2639.3352568621 318.0309514181 6053349240.7251205444 129.0000050784 129.0000050784 129.0000050784
|
||||
67 242.9638979697 2819.8046112742 2146689.2612710390 2639.3134547096 317.9497748498 6053248749.7987766266 129.0000052335 129.0000052335 129.0000052335
|
||||
68 243.0283540775 2819.8395632725 2146689.2691296688 2639.2912303374 318.0341240273 6053323803.0382738113 129.0000053909 129.0000053909 129.0000053909
|
||||
69 243.2256418664 2819.9609646019 2146689.2771047787 2639.2684509205 318.2923006889 6053584436.4588871002 129.0000055506 129.0000055506 129.0000055506
|
||||
70 243.2507495334 2819.9706145524 2146689.2851963686 2639.2450126010 318.3251573278 6053605174.7221174240 129.0000057127 129.0000057127 129.0000057127
|
||||
71 243.4287155518 2820.0794853386 2146689.2934044413 2639.2213699915 318.5580489464 6053838909.6197280884 129.0000058771 129.0000058771 129.0000058771
|
||||
72 243.5097518574 2820.1249498194 2146689.3017290002 2639.1971212009 318.6640954635 6053936531.2101163864 129.0000060439 129.0000060439 129.0000060439
|
||||
73 243.5356790969 2820.1337977544 2146689.3101700447 2639.1723394661 318.6980246193 6053955548.7824945450 129.0000062130 129.0000062130 129.0000062130
|
||||
74 243.5479180498 2820.1331964183 2146689.3187275808 2639.1473868749 318.7140408766 6053954282.0339813232 129.0000063844 129.0000063844 129.0000063844
|
||||
75 243.7115573025 2820.2314361523 2146689.3274016059 2639.1220411207 318.9281840641 6054165196.6845111847 129.0000065581 129.0000065581 129.0000065581
|
||||
76 243.7457279618 2820.2454531429 2146689.3361921217 2639.0963868224 318.9729008040 6054195311.5999307632 129.0000067342 129.0000067342 129.0000067342
|
||||
77 243.8345031069 2820.2948644965 2146689.3450991292 2639.0700900389 319.0890745962 6054301407.5461502075 129.0000069126 129.0000069126 129.0000069126
|
||||
78 244.0193931195 2820.4067881628 2146689.3541226317 2639.0435094409 319.3310271594 6054541698.3381366730 129.0000070934 129.0000070934 129.0000070934
|
||||
79 243.9919100078 2820.3799166166 2146689.3632626338 2639.0164249037 319.2950619430 6054484039.2541246414 129.0000072765 129.0000072765 129.0000072765
|
||||
80 244.0965612207 2820.4387335935 2146689.3725191355 2638.9888176882 319.4320116291 6054610327.1403293610 129.0000074619 129.0000074619 129.0000074619
|
||||
81 244.1334315951 2820.4535208568 2146689.3818921377 2638.9608330195 319.4802612965 6054642097.2373485565 129.0000076496 129.0000076496 129.0000076496
|
||||
82 244.3029520408 2820.5543485196 2146689.3913816395 2638.9318525796 319.7021007878 6054858569.6761827469 129.0000078397 129.0000078397 129.0000078397
|
||||
83 244.3445761189 2820.5713690935 2146689.4009876498 2638.9021684795 319.7565712929 6054895134.6560049057 129.0000080321 129.0000080321 129.0000080321
|
||||
84 244.2696671559 2820.5125763350 2146689.4107101629 2638.8720941742 319.6585431986 6054768952.2869329453 129.0000082269 129.0000082269 129.0000082269
|
||||
85 244.5161919319 2820.6629431352 2146689.4205491822 2638.8415194387 319.9811528443 6055091770.8571672440 129.0000084240 129.0000084240 129.0000084240
|
||||
86 244.5641090282 2820.6838080201 2146689.4305047127 2638.8103612394 320.0438585800 6055136589.3662166595 129.0000086234 129.0000086234 129.0000086234
|
||||
87 244.5348240638 2820.6541129118 2146689.4405767513 2638.7789728309 320.0055354056 6055072871.6007261276 129.0000088251 129.0000088251 129.0000088251
|
||||
88 244.6939431427 2820.7468233396 2146689.4507653015 2638.7470269267 320.2137633592 6055271920.8364210129 129.0000090292 129.0000090292 129.0000090292
|
||||
89 244.8800201091 2820.8567117003 2146689.4610703662 2638.7147520097 320.4572692055 6055507846.0901927948 129.0000092356 129.0000092356 129.0000092356
|
||||
90 244.8804280382 2820.8451141876 2146689.4714919478 2638.6820441173 320.4578030336 6055482979.2295818329 129.0000094444 129.0000094444 129.0000094444
|
||||
91 244.9558851986 2820.8815975090 2146689.4820300462 2638.6491836104 320.5565485155 6055561327.3181543350 129.0000096555 129.0000096555 129.0000096555
|
||||
92 244.9965893140 2820.8949614294 2146689.4926846647 2638.6159817170 320.6098151301 6055590045.5610351562 129.0000098689 129.0000098689 129.0000098689
|
||||
93 245.1381056687 2820.9732811388 2146689.5034558061 2638.5824451870 320.7950076360 6055758204.0434722900 129.0000100846 129.0000100846 129.0000100846
|
||||
94 245.2954807041 2821.0619342131 2146689.5143434699 2638.5485198222 321.0009532826 6055948545.3822879791 129.0000103027 129.0000103027 129.0000103027
|
||||
95 245.3535822199 2821.0860553731 2146689.5253476589 2638.5144817512 321.0769866522 6056000357.0671482086 129.0000105232 129.0000105232 129.0000105232
|
||||
96 245.5013476026 2821.1682908185 2146689.5364683764 2638.4801107361 321.2703568219 6056176922.4099712372 129.0000107459 129.0000107459 129.0000107459
|
||||
97 245.4166531417 2821.0989038023 2146689.5477056229 2638.4453663061 321.1595231342 6056028001.7295455933 129.0000109710 129.0000109710 129.0000109710
|
||||
98 245.4121937790 2821.0817490953 2146689.5590593945 2638.4097762390 321.1536874797 6055991207.9293851852 129.0000111984 129.0000111984 129.0000111984
|
||||
99 245.4532592994 2821.0946353191 2146689.5705296928 2638.3738037546 321.2074270397 6056018903.0102539062 129.0000114282 129.0000114282 129.0000114282
|
||||
100 245.7500657390 2821.2735939427 2146689.5821165247 2638.3375549051 321.5958367642 6056403104.3106222153 129.0000116603 129.0000116603 129.0000116603
|
||||
Loop time of 5.22601 on 1 procs for 100 steps with 10125 atoms
|
||||
|
||||
Performance: 2.133 ns/day, 11.250 hours/ns, 24.691 timesteps/s
|
||||
99.8% CPU use with 1 MPI tasks x no OpenMP threads
|
||||
Performance: 1.653 ns/day, 14.517 hours/ns, 19.135 timesteps/s
|
||||
99.7% CPU use with 1 MPI tasks x no OpenMP threads
|
||||
|
||||
MPI task timing breakdown:
|
||||
Section | min time | avg time | max time |%varavg| %total
|
||||
---------------------------------------------------------------
|
||||
Pair | 0.46587 | 0.46587 | 0.46587 | 0.0 | 11.50
|
||||
Neigh | 1.4713 | 1.4713 | 1.4713 | 0.0 | 36.33
|
||||
Comm | 0.05567 | 0.05567 | 0.05567 | 0.0 | 1.37
|
||||
Output | 0.011364 | 0.011364 | 0.011364 | 0.0 | 0.28
|
||||
Modify | 2.0158 | 2.0158 | 2.0158 | 0.0 | 49.77
|
||||
Other | | 0.03004 | | | 0.74
|
||||
Pair | 0.44045 | 0.44045 | 0.44045 | 0.0 | 8.43
|
||||
Neigh | 2.669 | 2.669 | 2.669 | 0.0 | 51.07
|
||||
Comm | 0.056143 | 0.056143 | 0.056143 | 0.0 | 1.07
|
||||
Output | 0.012469 | 0.012469 | 0.012469 | 0.0 | 0.24
|
||||
Modify | 2.0163 | 2.0163 | 2.0163 | 0.0 | 38.58
|
||||
Other | | 0.03168 | | | 0.61
|
||||
|
||||
Nlocal: 10125 ave 10125 max 10125 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
@ -172,4 +180,4 @@ Dangerous builds not checked
|
||||
|
||||
Please see the log.cite file for references relevant to this simulation
|
||||
|
||||
Total wall time: 0:00:04
|
||||
Total wall time: 0:00:05
|
||||
|
||||
@ -1,163 +1,163 @@
|
||||
############################################################################
|
||||
# Input file for investigating twinning nucleation under uniaxial loading with basal plane vector analysis
|
||||
# Christopher Barrett, March 2013
|
||||
# This script requires a Mg pair potential file to be in the same directory.
|
||||
|
||||
# fname is the file name. It is necessary for loops to work correctly. (See jump command)
|
||||
variable fname index in.basal
|
||||
|
||||
######################################
|
||||
# POTENTIAL VARIABLES
|
||||
# lattice parameters and the minimum energy per atom which should be obtained with the current pair potential and homogeneous lattice
|
||||
variable lx equal 3.181269601
|
||||
variable b equal sqrt(3)
|
||||
variable c equal sqrt(8/3)
|
||||
variable ly equal ${b}*${lx}
|
||||
variable lz equal ${c}*${lx}
|
||||
variable pairlocation index almg.liu
|
||||
variable pairstyle index eam/alloy/opt
|
||||
|
||||
######################################
|
||||
# EQUILIBRATION/DEFORMATION VARIABLES
|
||||
# eqpress = 10 bar = 1 MPa
|
||||
# tstep (the timestep) is set to a default value of 0.001 (1 fs)
|
||||
# seed randomizes the velocity
|
||||
# srate is the rate of strain in 1/s
|
||||
# Ndump is the number of timesteps in between each dump of the atom coordinates
|
||||
variable tstep equal 0.001
|
||||
variable seed equal 95812384
|
||||
variable srate equal 1e9
|
||||
|
||||
######################################
|
||||
# INITIALIZATION
|
||||
units metal
|
||||
dimension 3
|
||||
boundary s s s
|
||||
atom_style atomic
|
||||
|
||||
######################################
|
||||
# ATOM BUILD
|
||||
atom_modify map array
|
||||
|
||||
# lattice custom scale a1 "coordinates of a1" a2 "coordinates of a2" a3 "coordinates of a3" basis "atom1 coordinates" basis "atom2 coordinates" basis "atom3 coordinates" basis "atom4 coordinates" orient x "crystallographic orientation of x axis" orient y "crystallographic orientation of y axis" orient z "crystallographic orientation of z axis"
|
||||
lattice custom 3.181269601 a1 1 0 0 a2 0 1.732050808 0 a3 0 0 1.632993162 basis 0.0 0.0 0.0 basis 0.5 0.5 0 basis 0 0.3333333 0.5 basis 0.5 0.833333 0.5 orient x 0 1 1 orient y 1 0 0 orient z 0 1 -1
|
||||
variable multiple equal 20
|
||||
variable mx equal "v_lx*v_multiple"
|
||||
variable my equal "v_ly*v_multiple"
|
||||
variable mz equal "v_lz*v_multiple"
|
||||
|
||||
# the simulation region should be from 0 to a multiple of the periodic boundary in x, y and z.
|
||||
region whole block 0 ${mz} 0 ${mx} 0 ${my} units box
|
||||
create_box 2 whole
|
||||
create_atoms 1 box basis 1 1 basis 2 1 basis 3 1 basis 4 1
|
||||
|
||||
region fixed1 block INF INF INF INF INF 10 units box
|
||||
region fixed2 block INF INF INF INF 100 INF units box
|
||||
group lower region fixed1
|
||||
group upper region fixed2
|
||||
group boundary union upper lower
|
||||
group mobile subtract all boundary
|
||||
|
||||
variable natoms equal "count(all)"
|
||||
print "# of atoms are: ${natoms}"
|
||||
|
||||
######################################
|
||||
# INTERATOMIC POTENTIAL
|
||||
pair_style ${pairstyle}
|
||||
pair_coeff * * ${pairlocation} Mg Mg
|
||||
|
||||
######################################
|
||||
# COMPUTES REQUIRED
|
||||
compute csym all centro/atom 12
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
compute basal all basal/atom
|
||||
|
||||
######################################
|
||||
# MINIMIZATION
|
||||
# Primarily adjusts the c/a ratio to value predicted by EAM potential
|
||||
reset_timestep 0
|
||||
thermo 1
|
||||
thermo_style custom step pe c_eatoms
|
||||
min_style cg
|
||||
minimize 1e-15 1e-15 1000 2000
|
||||
variable eminimum equal "c_eatoms / count(all)"
|
||||
print "%%e(it,1)=${eminimum}"
|
||||
|
||||
######################################
|
||||
# EQUILIBRATION
|
||||
reset_timestep 0
|
||||
timestep ${tstep}
|
||||
# atoms are given a random velocity based on a temperature of 100K.
|
||||
velocity all create 100 ${seed} mom yes rot no
|
||||
|
||||
# time integration is plain NVE; no thermostat or barostat is applied here (the 100 K comes from the velocity command above)
|
||||
fix 1 all nve
|
||||
|
||||
# Set thermo output
|
||||
thermo 100
|
||||
thermo_style custom step lx ly lz press pxx pyy pzz pe temp
|
||||
|
||||
# Run for at least 2 picoseconds (assuming 1 fs timestep)
|
||||
run 2000
|
||||
|
||||
# Loop to run until pressure is below the variable eqpress (defined at beginning of file)
|
||||
label loopeq
|
||||
variable eq loop 100
|
||||
run 250
|
||||
variable converge equal press
|
||||
if "${converge} <= 0" then "variable converge equal -press" else "variable converge equal press"
|
||||
if "${converge} <= 50" then "jump ${fname} breakeq"
|
||||
next eq
|
||||
jump ${fname} loopeq
|
||||
label breakeq
|
||||
|
||||
# Store length for strain rate calculations
|
||||
variable tmp equal "lx"
|
||||
variable L0 equal ${tmp}
|
||||
print "Initial Length, L0: ${L0}"
|
||||
unfix 1
|
||||
|
||||
######################################
|
||||
# DEFORMATION
|
||||
reset_timestep 0
|
||||
timestep ${tstep}
|
||||
|
||||
# Impose constant strain rate
|
||||
variable srate1 equal "v_srate / 1.0e10"
|
||||
velocity upper set 0.0 NULL 0.0 units box
|
||||
velocity lower set 0.0 NULL 0.0 units box
|
||||
|
||||
fix 2 upper setforce 0.0 NULL 0.0
|
||||
fix 3 lower setforce 0.0 NULL 0.0
|
||||
fix 1 all nve
|
||||
|
||||
# Output strain and stress info to file
|
||||
# for units metal, pressure is in [bars] = 100 [kPa] = 1/10000 [GPa]
|
||||
# p2 is in GPa
|
||||
variable strain equal "(lx - v_L0)/v_L0"
|
||||
variable p1 equal "v_strain"
|
||||
variable p2 equal "-pxz/10000"
|
||||
variable p3 equal "lx"
|
||||
variable p4 equal "temp"
|
||||
variable p5 equal "pe"
|
||||
variable p6 equal "ke"
|
||||
fix def1 all print 100 "${p1} ${p2} ${p3} ${p4} ${p5} ${p6}" file output.def1.txt screen no
|
||||
# Dump coordinates to file (for void size calculations)
|
||||
dump 1 all custom 1000 output.dump.* id x y z c_basal[1] c_basal[2] c_basal[3]
|
||||
|
||||
# Display thermo
|
||||
thermo_style custom step v_strain pxz lx temp pe ke
|
||||
restart 50000 output.restart
|
||||
|
||||
# run deformation for 100000 timesteps (10% strain assuming 1 fs timestep and 1e9/s strainrate)
|
||||
variable runtime equal 0
|
||||
label loop
|
||||
displace_atoms all ramp x 0.0 ${srate1} z 10 100 units box
|
||||
run 100
|
||||
variable runtime equal ${runtime}+100
|
||||
if "${runtime} < 100000" then "jump ${fname} loop"
|
||||
|
||||
######################################
|
||||
# SIMULATION DONE
|
||||
print "All done"
|
||||
############################################################################
|
||||
# Input file for investigating twinning nucleation under uniaxial loading with basal plane vector analysis
|
||||
# Christopher Barrett, March 2013
|
||||
# This script requires a Mg pair potential file to be in the same directory.
|
||||
|
||||
# fname is the file name. It is necessary for loops to work correctly. (See jump command)
|
||||
variable fname index in.basal
|
||||
|
||||
######################################
|
||||
# POTENTIAL VARIABLES
|
||||
# lattice parameters and the minimum energy per atom which should be obtained with the current pair potential and homogeneous lattice
|
||||
variable lx equal 3.181269601
|
||||
variable b equal sqrt(3)
|
||||
variable c equal sqrt(8/3)
|
||||
variable ly equal ${b}*${lx}
|
||||
variable lz equal ${c}*${lx}
|
||||
variable pairlocation index almg.liu
|
||||
variable pairstyle index eam/alloy/opt
|
||||
|
||||
######################################
|
||||
# EQUILIBRATION/DEFORMATION VARIABLES
|
||||
# eqpress = 10 bar = 1 MPa
|
||||
# tstep (the timestep) is set to a default value of 0.001 (1 fs)
|
||||
# seed randomizes the velocity
|
||||
# srate is the rate of strain in 1/s
|
||||
# Ndump is the number of timesteps in between each dump of the atom coordinates
|
||||
variable tstep equal 0.001
|
||||
variable seed equal 95812384
|
||||
variable srate equal 1e9
|
||||
|
||||
######################################
|
||||
# INITIALIZATION
|
||||
units metal
|
||||
dimension 3
|
||||
boundary s s s
|
||||
atom_style atomic
|
||||
|
||||
######################################
|
||||
# ATOM BUILD
|
||||
atom_modify map array
|
||||
|
||||
# lattice custom scale a1 "coordinates of a1" a2 "coordinates of a2" a3 "coordinates of a3" basis "atom1 coordinates" basis "atom2 coordinates" basis "atom3 coordinates" basis "atom4 coordinates" orient x "crystallographic orientation of x axis" orient y "crystallographic orientation of y axis" orient z "crystallographic orientation of z axis"
|
||||
lattice custom 3.181269601 a1 1 0 0 a2 0 1.732050808 0 a3 0 0 1.632993162 basis 0.0 0.0 0.0 basis 0.5 0.5 0 basis 0 0.3333333 0.5 basis 0.5 0.833333 0.5 orient x 0 1 1 orient y 1 0 0 orient z 0 1 -1
|
||||
variable multiple equal 20
|
||||
variable mx equal "v_lx*v_multiple"
|
||||
variable my equal "v_ly*v_multiple"
|
||||
variable mz equal "v_lz*v_multiple"
|
||||
|
||||
# the simulation region should be from 0 to a multiple of the periodic boundary in x, y and z.
|
||||
region whole block 0 ${mz} 0 ${mx} 0 ${my} units box
|
||||
create_box 2 whole
|
||||
create_atoms 1 box basis 1 1 basis 2 1 basis 3 1 basis 4 1
|
||||
|
||||
region fixed1 block INF INF INF INF INF 10 units box
|
||||
region fixed2 block INF INF INF INF 100 INF units box
|
||||
group lower region fixed1
|
||||
group upper region fixed2
|
||||
group boundary union upper lower
|
||||
group mobile subtract all boundary
|
||||
|
||||
variable natoms equal "count(all)"
|
||||
print "# of atoms are: ${natoms}"
|
||||
|
||||
######################################
|
||||
# INTERATOMIC POTENTIAL
|
||||
pair_style ${pairstyle}
|
||||
pair_coeff * * ${pairlocation} Mg Mg
|
||||
|
||||
######################################
|
||||
# COMPUTES REQUIRED
|
||||
compute csym all centro/atom 12
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
compute basal all basal/atom
|
||||
|
||||
######################################
|
||||
# MINIMIZATION
|
||||
# Primarily adjusts the c/a ratio to value predicted by EAM potential
|
||||
reset_timestep 0
|
||||
thermo 1
|
||||
thermo_style custom step pe c_eatoms
|
||||
min_style cg
|
||||
minimize 1e-15 1e-15 1000 2000
|
||||
variable eminimum equal "c_eatoms / count(all)"
|
||||
print "%%e(it,1)=${eminimum}"
|
||||
|
||||
######################################
|
||||
# EQUILIBRATION
|
||||
reset_timestep 0
|
||||
timestep ${tstep}
|
||||
# atoms are given a random velocity based on a temperature of 100K.
|
||||
velocity all create 100 ${seed} mom yes rot no
|
||||
|
||||
# time integration is plain NVE; no thermostat or barostat is applied here (the 100 K comes from the velocity command above)
|
||||
fix 1 all nve
|
||||
|
||||
# Set thermo output
|
||||
thermo 100
|
||||
thermo_style custom step lx ly lz press pxx pyy pzz pe temp
|
||||
|
||||
# Run for at least 2 picoseconds (assuming 1 fs timestep)
|
||||
run 2000
|
||||
|
||||
# Loop to run until pressure is below the variable eqpress (defined at beginning of file)
|
||||
label loopeq
|
||||
variable eq loop 100
|
||||
run 250
|
||||
variable converge equal press
|
||||
if "${converge} <= 0" then "variable converge equal -press" else "variable converge equal press"
|
||||
if "${converge} <= 50" then "jump ${fname} breakeq"
|
||||
next eq
|
||||
jump ${fname} loopeq
|
||||
label breakeq
|
||||
|
||||
# Store length for strain rate calculations
|
||||
variable tmp equal "lx"
|
||||
variable L0 equal ${tmp}
|
||||
print "Initial Length, L0: ${L0}"
|
||||
unfix 1
|
||||
|
||||
######################################
|
||||
# DEFORMATION
|
||||
reset_timestep 0
|
||||
timestep ${tstep}
|
||||
|
||||
# Impose constant strain rate
|
||||
variable srate1 equal "v_srate / 1.0e10"
|
||||
velocity upper set 0.0 NULL 0.0 units box
|
||||
velocity lower set 0.0 NULL 0.0 units box
|
||||
|
||||
fix 2 upper setforce 0.0 NULL 0.0
|
||||
fix 3 lower setforce 0.0 NULL 0.0
|
||||
fix 1 all nve
|
||||
|
||||
# Output strain and stress info to file
|
||||
# for units metal, pressure is in [bars] = 100 [kPa] = 1/10000 [GPa]
|
||||
# p2 is in GPa
|
||||
variable strain equal "(lx - v_L0)/v_L0"
|
||||
variable p1 equal "v_strain"
|
||||
variable p2 equal "-pxz/10000"
|
||||
variable p3 equal "lx"
|
||||
variable p4 equal "temp"
|
||||
variable p5 equal "pe"
|
||||
variable p6 equal "ke"
|
||||
fix def1 all print 100 "${p1} ${p2} ${p3} ${p4} ${p5} ${p6}" file output.def1.txt screen no
|
||||
# Dump coordinates to file (for void size calculations)
|
||||
dump 1 all custom 1000 output.dump.* id x y z c_basal[1] c_basal[2] c_basal[3]
|
||||
|
||||
# Display thermo
|
||||
thermo_style custom step v_strain pxz lx temp pe ke
|
||||
restart 50000 output.restart
|
||||
|
||||
# run deformation for 100000 timesteps (10% strain assuming 1 fs timestep and 1e9/s strainrate)
|
||||
variable runtime equal 0
|
||||
label loop
|
||||
displace_atoms all ramp x 0.0 ${srate1} z 10 100 units box
|
||||
run 100
|
||||
variable runtime equal ${runtime}+100
|
||||
if "${runtime} < 100000" then "jump ${fname} loop"
|
||||
|
||||
######################################
|
||||
# SIMULATION DONE
|
||||
print "All done"
|
||||
|
||||
@ -15,6 +15,7 @@ bond_style harmonic
|
||||
bond_coeff * 225.0 0.85
|
||||
|
||||
comm_modify vel yes
|
||||
comm_modify cutoff 3.6
|
||||
|
||||
# must use pair hybrid, since srp bond particles
|
||||
# do not interact with other atoms types
|
||||
|
||||
@ -78,7 +78,7 @@ run 100
|
||||
|
||||
# only output atoms near vacancy
|
||||
|
||||
compute coord all coord/atom $r
|
||||
compute coord all coord/atom cutoff $r
|
||||
|
||||
#dump events all custom 1 dump.prd id type x y z
|
||||
#dump_modify events thresh c_coord != 4
|
||||
|
||||
@ -80,7 +80,7 @@ velocity all zero linear
|
||||
|
||||
# only output atoms near vacancy
|
||||
|
||||
compute coord all coord/atom $r
|
||||
compute coord all coord/atom cutoff $r
|
||||
|
||||
#dump events all custom 1 dump.prd id type x y z
|
||||
#dump_modify events thresh c_coord != 4
|
||||
|
||||
8
lib/kokkos/.gitignore
vendored
8
lib/kokkos/.gitignore
vendored
@ -1,8 +0,0 @@
|
||||
# Standard ignores
|
||||
*~
|
||||
*.pyc
|
||||
\#*#
|
||||
.#*
|
||||
.*.swp
|
||||
.cproject
|
||||
.project
|
||||
284
lib/kokkos/CHANGELOG.md
Normal file
284
lib/kokkos/CHANGELOG.md
Normal file
@ -0,0 +1,284 @@
|
||||
# Change Log
|
||||
|
||||
## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589)
|
||||
- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588)
|
||||
- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584)
|
||||
- Building Tutorial Examples [\#582](https://github.com/kokkos/kokkos/issues/582)
|
||||
- Internal way for using ThreadVectorRange without TeamHandle [\#574](https://github.com/kokkos/kokkos/issues/574)
|
||||
- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571)
|
||||
- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557)
|
||||
- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543)
|
||||
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
|
||||
- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536)
|
||||
- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535)
|
||||
- Deprecate Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527)
|
||||
- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522)
|
||||
- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517)
|
||||
- Kokkos Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501)
|
||||
- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393)
|
||||
- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586)
|
||||
- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570)
|
||||
- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568)
|
||||
- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553)
|
||||
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
|
||||
- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539)
|
||||
- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534)
|
||||
|
||||
## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Add Changelog generation to our process. [\#506](https://github.com/kokkos/kokkos/issues/506)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520)
|
||||
- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516)
|
||||
|
||||
## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511)
|
||||
- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508)
|
||||
- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498)
|
||||
- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490)
|
||||
- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485)
|
||||
- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484)
|
||||
- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473)
|
||||
- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468)
|
||||
- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465)
|
||||
- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455)
|
||||
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
|
||||
- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432)
|
||||
- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421)
|
||||
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
|
||||
- Import WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325)
|
||||
- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305)
|
||||
- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300)
|
||||
- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515)
|
||||
- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509)
|
||||
- multi\_scratch test with hwloc and pthreads seg-faults. [\#504](https://github.com/kokkos/kokkos/issues/504)
|
||||
- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503)
|
||||
- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502)
|
||||
- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497)
|
||||
- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491)
|
||||
- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489)
|
||||
- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479)
|
||||
- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471)
|
||||
- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424)
|
||||
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
|
||||
- New task policy implementation is pulling in old experimental code. [\#372](https://github.com/kokkos/kokkos/issues/372)
|
||||
- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298)
|
||||
|
||||
## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438)
|
||||
- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436)
|
||||
- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
|
||||
- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323)
|
||||
- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
|
||||
- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435)
|
||||
- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422)
|
||||
- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404)
|
||||
- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
|
||||
- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310)
|
||||
- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307)
|
||||
- Core Oversubscription Detection Broken? [\#159](https://github.com/kokkos/kokkos/issues/159)
|
||||
|
||||
|
||||
## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411)
|
||||
- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390)
|
||||
- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387)
|
||||
- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378)
|
||||
- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373)
|
||||
- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341)
|
||||
- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336)
|
||||
- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407)
|
||||
- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402)
|
||||
- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398)
|
||||
- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389)
|
||||
- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385)
|
||||
- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365)
|
||||
- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352)
|
||||
- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344)
|
||||
- Can I allocate a View w/ both WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324)
|
||||
- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309)
|
||||
- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196)
|
||||
|
||||
## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327)
|
||||
- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321)
|
||||
- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295)
|
||||
- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280)
|
||||
- Interoperability between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245)
|
||||
- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177)
|
||||
- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176)
|
||||
- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99)
|
||||
- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293)
|
||||
- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292)
|
||||
- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286)
|
||||
- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267)
|
||||
- Suggestion: Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261)
|
||||
- More flexible create\_mirror functions [\#260](https://github.com/kokkos/kokkos/issues/260)
|
||||
- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256)
|
||||
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
|
||||
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
|
||||
- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234)
|
||||
- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230)
|
||||
- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227)
|
||||
- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216)
|
||||
- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212)
|
||||
- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194)
|
||||
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
|
||||
- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189)
|
||||
- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158)
|
||||
- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153)
|
||||
- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152)
|
||||
- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149)
|
||||
- Dynamic resizable 1-dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143)
|
||||
- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142)
|
||||
- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138)
|
||||
- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135)
|
||||
- Add variant of subview that lets users add traits [\#134](https://github.com/kokkos/kokkos/issues/134)
|
||||
- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121)
|
||||
- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117)
|
||||
- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108)
|
||||
- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106)
|
||||
- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105)
|
||||
- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104)
|
||||
- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103)
|
||||
- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100)
|
||||
- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81)
|
||||
- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80)
|
||||
- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72)
|
||||
- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53)
|
||||
- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50)
|
||||
- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44)
|
||||
- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28)
|
||||
- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340)
|
||||
- seg fault Kokkos::Impl::CudaInternal::print\_configuration [\#338](https://github.com/kokkos/kokkos/issues/338)
|
||||
- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335)
|
||||
- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311)
|
||||
- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303)
|
||||
- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279)
|
||||
- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312)
|
||||
- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299)
|
||||
- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297)
|
||||
- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296)
|
||||
- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282)
|
||||
- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281)
|
||||
- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262)
|
||||
- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248)
|
||||
- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238)
|
||||
- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225)
|
||||
- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210)
|
||||
- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. [\#202](https://github.com/kokkos/kokkos/issues/202)
|
||||
- nvcc\_wrapper Does Not Support -Xcompiler \<compiler option\> [\#198](https://github.com/kokkos/kokkos/issues/198)
|
||||
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
|
||||
- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186)
|
||||
- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182)
|
||||
- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175)
|
||||
- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173)
|
||||
- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166)
|
||||
- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161)
|
||||
- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157)
|
||||
- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139)
|
||||
- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131)
|
||||
- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125)
|
||||
- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119)
|
||||
- Qthread taskpolicy initialization bug. [\#92](https://github.com/kokkos/kokkos/issues/92)
|
||||
- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89)
|
||||
- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88)
|
||||
- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87)
|
||||
- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85)
|
||||
- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76)
|
||||
- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69)
|
||||
- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63)
|
||||
- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60)
|
||||
- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20)
|
||||
- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6)
|
||||
- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4)
|
||||
|
||||
**Closed issues:**
|
||||
|
||||
- develop branch broken with CUDA 8 and --expt-extended-lambda [\#354](https://github.com/kokkos/kokkos/issues/354)
|
||||
- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349)
|
||||
- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343)
|
||||
- Can I safely use int indices in a 2-D View with capacity \> 2B? [\#318](https://github.com/kokkos/kokkos/issues/318)
|
||||
- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317)
|
||||
- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277)
|
||||
- deleted [\#271](https://github.com/kokkos/kokkos/issues/271)
|
||||
- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268)
|
||||
- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246)
|
||||
- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232)
|
||||
- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209)
|
||||
- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207)
|
||||
- Unit test failure on Hansen KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200)
|
||||
- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180)
|
||||
- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171)
|
||||
- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165)
|
||||
- Tie atomic updates to execution space or even to thread team? \(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144)
|
||||
- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137)
|
||||
- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136)
|
||||
- Signed/unsigned comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130)
|
||||
- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126)
|
||||
- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110)
|
||||
- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91)
|
||||
- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90)
|
||||
- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84)
|
||||
- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42)
|
||||
- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39)
|
||||
- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31)
|
||||
- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25)
|
||||
- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24)
|
||||
- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23)
|
||||
- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21)
|
||||
- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17)
|
||||
- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16)
|
||||
- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9)
|
||||
- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5)
|
||||
|
||||
## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15)
|
||||
|
||||
|
||||
\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)*
|
||||
@ -34,8 +34,8 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
|
||||
# for compatibility with Kokkos' Makefile build system.
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
${PACKAGE_NAME}_ENABLE_DEBUG
|
||||
${PACKAGE_NAME_UC}_HAVE_DEBUG
|
||||
Kokkos_ENABLE_DEBUG
|
||||
KOKKOS_HAVE_DEBUG
|
||||
"Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
|
||||
${${PROJECT_NAME}_ENABLE_DEBUG}
|
||||
)
|
||||
@ -57,7 +57,21 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Cuda_UVM
|
||||
KOKKOS_USE_CUDA_UVM
|
||||
"Enable CUDA Unified Virtual Memory support in Kokkos."
|
||||
"Enable CUDA Unified Virtual Memory as the default in Kokkos."
|
||||
OFF
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Cuda_RDC
|
||||
KOKKOS_HAVE_CUDA_RDC
|
||||
"Enable CUDA Relocatable Device Code support in Kokkos."
|
||||
OFF
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Cuda_Lambda
|
||||
KOKKOS_HAVE_CUDA_LAMBDA
|
||||
"Enable CUDA LAMBDA support in Kokkos."
|
||||
OFF
|
||||
)
|
||||
|
||||
@ -72,6 +86,9 @@ ASSERT_DEFINED(TPL_ENABLE_Pthread)
|
||||
IF (Kokkos_ENABLE_Pthread AND NOT TPL_ENABLE_Pthread)
|
||||
MESSAGE(FATAL_ERROR "You set Kokkos_ENABLE_Pthread=ON, but Trilinos' support for Pthread(s) is not enabled (TPL_ENABLE_Pthread=OFF). This is not allowed. Please enable Pthreads in Trilinos before attempting to enable Kokkos' support for Pthreads.")
|
||||
ENDIF ()
|
||||
IF (NOT TPL_ENABLE_Pthread)
|
||||
ADD_DEFINITIONS(-DGTEST_HAS_PTHREAD=0)
|
||||
ENDIF()
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_OpenMP
|
||||
@ -162,13 +179,28 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
#
|
||||
# C) Process the subpackages for Kokkos
|
||||
# C) Install Kokkos' executable scripts
|
||||
#
|
||||
|
||||
|
||||
# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler.
|
||||
# Kokkos needs nvcc_wrapper in order to build. Other libraries and
|
||||
# executables also need nvcc_wrapper. Thus, we need to install it.
|
||||
# If the argument of DESTINATION is a relative path, CMake computes it
|
||||
# as relative to ${CMAKE_INSTALL_PREFIX}.
|
||||
|
||||
INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION bin)
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
#
|
||||
# D) Process the subpackages for Kokkos
|
||||
#
|
||||
|
||||
TRIBITS_PROCESS_SUBPACKAGES()
|
||||
|
||||
#
|
||||
# D) If Kokkos itself is enabled, process the Kokkos package
|
||||
# E) If Kokkos itself is enabled, process the Kokkos package
|
||||
#
|
||||
|
||||
TRIBITS_PACKAGE_DEF()
|
||||
|
||||
@ -7,25 +7,26 @@ CXXFLAGS=$(CCFLAGS)
|
||||
#Options: OpenMP,Serial,Pthreads,Cuda
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,KNL,BDW,SKX
|
||||
KOKKOS_ARCH ?= ""
|
||||
#Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
#Options: hwloc,librt,experimental_memkind
|
||||
KOKKOS_USE_TPLS ?= ""
|
||||
#Options: c++11
|
||||
#Options: c++11,c++1z
|
||||
KOKKOS_CXX_STANDARD ?= "c++11"
|
||||
#Options: aggressive_vectorization,disable_profiling
|
||||
KOKKOS_OPTIONS ?= ""
|
||||
|
||||
#Default settings specific options
|
||||
#Options: force_uvm,use_ldg,rdc,enable_lambda
|
||||
KOKKOS_CUDA_OPTIONS ?= ""
|
||||
KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
|
||||
|
||||
# Check for general settings
|
||||
|
||||
KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
|
||||
KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
|
||||
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
|
||||
|
||||
# Check for external libraries
|
||||
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
|
||||
@ -53,23 +54,71 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Check for other Execution Spaces
|
||||
|
||||
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
|
||||
CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
|
||||
endif
|
||||
|
||||
# Check OS
|
||||
|
||||
KOKKOS_OS := $(shell uname -s)
|
||||
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l)
|
||||
KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l)
|
||||
KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l)
|
||||
|
||||
# Check compiler
|
||||
|
||||
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
|
||||
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(CXX) --version 2>&1 | grep "nvcc" | wc -l)
|
||||
ifneq ($(OMPI_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l)
|
||||
endif
|
||||
ifneq ($(MPICH_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l)
|
||||
endif
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version 2>&1 | grep "clang" | wc -l)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG = 1
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_XL = 1
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
|
||||
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
|
||||
endif
|
||||
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# OpenMP is turned on by default in Cray compiler environment
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG :=
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
|
||||
else
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# OpenMP is turned on by default in Cray compiler environment
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG :=
|
||||
else
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
@ -84,13 +133,11 @@ else
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
|
||||
else
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
|
||||
KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Check for other Execution Spaces
|
||||
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
|
||||
|
||||
# Check for Kokkos Architecture settings
|
||||
|
||||
#Intel based
|
||||
@ -98,6 +145,7 @@ KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC |
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
|
||||
|
||||
#NVIDIA based
|
||||
@ -110,11 +158,13 @@ KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal60 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
@ -127,13 +177,16 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
endif
|
||||
|
||||
#ARM based
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
|
||||
|
||||
#IBM based
|
||||
KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
|
||||
@ -145,17 +198,18 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
#Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
|
||||
|
||||
#Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
|
||||
@ -207,15 +261,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
|
||||
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
|
||||
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
|
||||
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
@ -230,9 +290,15 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_CXXFLAGS += -G
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_CXXFLAGS += -lineinfo
|
||||
endif
|
||||
KOKKOS_CXXFLAGS += -g
|
||||
KOKKOS_LDFLAGS += -g -ldl
|
||||
@ -273,13 +339,14 @@ endif
|
||||
|
||||
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
|
||||
@ -289,27 +356,101 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -expt-extended-lambda
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -expt-extended-lambda
|
||||
else
|
||||
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
|
||||
endif
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
#Add Architecture flags
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
KOKKOS_LDFLAGS += -mavx
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -march=armv8-a
|
||||
KOKKOS_LDFLAGS += -march=armv8-a
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -march=armv8.1-a
|
||||
KOKKOS_LDFLAGS += -march=armv8.1-a
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
|
||||
KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
KOKKOS_LDFLAGS += -mavx
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=sandybridge
|
||||
KOKKOS_LDFLAGS += -tp=sandybridge
|
||||
else
|
||||
# Assume that this really is a GNU compiler
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
KOKKOS_LDFLAGS += -mavx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this really is a GNU compiler, or possibly XL on P8
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
@ -322,7 +463,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
KOKKOS_CXXFLAGS += -tp=haswell
|
||||
KOKKOS_LDFLAGS += -tp=haswell
|
||||
else
|
||||
# Assume that this really is a GNU compiler
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
|
||||
@ -352,52 +494,85 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX512
|
||||
KOKKOS_LDFLAGS += -xCORE-AVX512
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Nothing here yet
|
||||
KOKKOS_CXXFLAGS += -march=skylake-avx512
|
||||
KOKKOS_LDFLAGS += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -mmic
|
||||
KOKKOS_LDFLAGS += -mmic
|
||||
endif
|
||||
|
||||
#Figure out the architecture flag for Cuda
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_30
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_32
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_35
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_37
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_50
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_52
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_53
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_61
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -424,6 +599,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
|
||||
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
|
||||
KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
|
||||
KOKKOS_LIBS += -lcudart -lcuda
|
||||
endif
|
||||
@ -443,7 +619,7 @@ endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
else
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
@ -451,6 +627,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
endif
|
||||
|
||||
#Explicitly set the GCC Toolchain for Clang
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
|
||||
KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
|
||||
KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
|
||||
KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
|
||||
endif
|
||||
|
||||
#With Cygwin functions such as fdopen and fileno are not defined
|
||||
#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
|
||||
#though. So we hard undefine it here. Not sure if that has any bad side effects
|
||||
@ -471,7 +655,7 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
|
||||
include $(KOKKOS_PATH)/Makefile.targets
|
||||
|
||||
kokkos-clean:
|
||||
-rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
|
||||
rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
|
||||
|
||||
libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
|
||||
ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
|
||||
|
||||
@ -14,20 +14,16 @@ Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
|
||||
Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
|
||||
Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
|
||||
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
|
||||
Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
|
||||
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
|
||||
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
|
||||
|
||||
@ -38,8 +34,6 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
@ -47,8 +41,6 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
|
||||
Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
|
||||
Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
@ -67,6 +59,4 @@ endif
|
||||
|
||||
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
|
||||
|
||||
|
||||
@ -45,31 +45,32 @@ Primary tested compilers on X86 are:
|
||||
Intel 14.0.4
|
||||
Intel 15.0.2
|
||||
Intel 16.0.1
|
||||
Intel 17.0.098
|
||||
Clang 3.5.2
|
||||
Clang 3.6.1
|
||||
Clang 3.9.0
|
||||
|
||||
Primary tested compilers on Power 8 are:
|
||||
IBM XL 13.1.3 (OpenMP,Serial)
|
||||
GCC 4.9.2 (OpenMP,Serial)
|
||||
GCC 5.3.0 (OpenMP,Serial)
|
||||
GCC 5.4.0 (OpenMP,Serial)
|
||||
IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
|
||||
|
||||
Primary tested compilers on Intel KNL are:
|
||||
Intel 16.2.181 (with gcc 4.7.2)
|
||||
Intel 17.0.098 (with gcc 4.7.2)
|
||||
|
||||
Secondary tested compilers are:
|
||||
CUDA 6.5 (with gcc 4.7.2)
|
||||
CUDA 7.0 (with gcc 4.7.2)
|
||||
CUDA 7.5 (with gcc 4.8.4)
|
||||
CUDA 7.5 (with gcc 4.7.2)
|
||||
CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
|
||||
CUDA/Clang 8.0 using Clang/Trunk compiler
|
||||
|
||||
Other compilers working:
|
||||
X86:
|
||||
Intel 17.0.042 (the FENL example causes internal compiler error)
|
||||
PGI 15.4
|
||||
Cygwin 2.1.0 64bit with gcc 4.9.3
|
||||
KNL:
|
||||
Intel 16.2.181 (the FENL example causes internal compiler error)
|
||||
Intel 17.0.042 (the FENL example causes internal compiler error)
|
||||
|
||||
Known non-working combinations:
|
||||
Power8:
|
||||
GCC 6.1.0
|
||||
Pthreads backend
|
||||
|
||||
|
||||
@ -92,9 +93,10 @@ master branch, without -Werror and only for a select set of backends.
|
||||
|
||||
In the 'example/tutorial' directory you will find step by step tutorial
|
||||
examples which explain many of the features of Kokkos. They work with
|
||||
simple Makefiles. To build with g++ and OpenMP simply type 'make openmp'
|
||||
simple Makefiles. To build with g++ and OpenMP simply type 'make'
|
||||
in the 'example/tutorial' directory. This will build all examples in the
|
||||
subfolders.
|
||||
subfolders. To change the build options refer to the Programming Guide
|
||||
in the compilation section.
|
||||
|
||||
============================================================================
|
||||
====Running Unit Tests======================================================
|
||||
|
||||
@ -476,54 +476,54 @@ namespace Kokkos {
|
||||
};
|
||||
|
||||
template<class Generator>
|
||||
struct rand<Generator, ::Kokkos::complex<float> > {
|
||||
struct rand<Generator, Kokkos::complex<float> > {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> max () {
|
||||
return ::Kokkos::complex<float> (1.0, 1.0);
|
||||
static Kokkos::complex<float> max () {
|
||||
return Kokkos::complex<float> (1.0, 1.0);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen) {
|
||||
static Kokkos::complex<float> draw (Generator& gen) {
|
||||
const float re = gen.frand ();
|
||||
const float im = gen.frand ();
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
return Kokkos::complex<float> (re, im);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& range) {
|
||||
static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& range) {
|
||||
const float re = gen.frand (real (range));
|
||||
const float im = gen.frand (imag (range));
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
return Kokkos::complex<float> (re, im);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& start, const ::Kokkos::complex<float>& end) {
|
||||
static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& start, const Kokkos::complex<float>& end) {
|
||||
const float re = gen.frand (real (start), real (end));
|
||||
const float im = gen.frand (imag (start), imag (end));
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
return Kokkos::complex<float> (re, im);
|
||||
}
|
||||
};
|
||||
|
||||
template<class Generator>
|
||||
struct rand<Generator, ::Kokkos::complex<double> > {
|
||||
struct rand<Generator, Kokkos::complex<double> > {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> max () {
|
||||
return ::Kokkos::complex<double> (1.0, 1.0);
|
||||
static Kokkos::complex<double> max () {
|
||||
return Kokkos::complex<double> (1.0, 1.0);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen) {
|
||||
static Kokkos::complex<double> draw (Generator& gen) {
|
||||
const double re = gen.drand ();
|
||||
const double im = gen.drand ();
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
return Kokkos::complex<double> (re, im);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& range) {
|
||||
static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& range) {
|
||||
const double re = gen.drand (real (range));
|
||||
const double im = gen.drand (imag (range));
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
return Kokkos::complex<double> (re, im);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& start, const ::Kokkos::complex<double>& end) {
|
||||
static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& start, const Kokkos::complex<double>& end) {
|
||||
const double re = gen.drand (real (start), real (end));
|
||||
const double im = gen.drand (imag (start), imag (end));
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
return Kokkos::complex<double> (re, im);
|
||||
}
|
||||
};
|
||||
|
||||
@ -670,8 +670,8 @@ namespace Kokkos {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while(S>=1.0) {
|
||||
U = drand();
|
||||
const double V = drand();
|
||||
U = 2.0*drand() - 1.0;
|
||||
const double V = 2.0*drand() - 1.0;
|
||||
S = U*U+V*V;
|
||||
}
|
||||
return U*sqrt(-2.0*log(S)/S);
|
||||
@ -910,8 +910,8 @@ namespace Kokkos {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while(S>=1.0) {
|
||||
U = drand();
|
||||
const double V = drand();
|
||||
U = 2.0*drand() - 1.0;
|
||||
const double V = 2.0*drand() - 1.0;
|
||||
S = U*U+V*V;
|
||||
}
|
||||
return U*sqrt(-2.0*log(S)/S);
|
||||
@ -1163,8 +1163,8 @@ namespace Kokkos {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while(S>=1.0) {
|
||||
U = drand();
|
||||
const double V = drand();
|
||||
U = 2.0*drand() - 1.0;
|
||||
const double V = 2.0*drand() - 1.0;
|
||||
S = U*U+V*V;
|
||||
}
|
||||
return U*sqrt(-2.0*log(S)/S);
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace SortImpl {
|
||||
namespace Impl {
|
||||
|
||||
template<class ValuesViewType, int Rank=ValuesViewType::Rank>
|
||||
struct CopyOp;
|
||||
@ -199,7 +199,7 @@ public:
|
||||
|
||||
parallel_for(values.dimension_0(),
|
||||
bin_sort_sort_functor<ValuesViewType, offset_type,
|
||||
SortImpl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
|
||||
Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
|
||||
|
||||
deep_copy(values,sorted_values);
|
||||
}
|
||||
@ -262,17 +262,15 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
namespace SortImpl {
|
||||
|
||||
template<class KeyViewType>
|
||||
struct DefaultBinOp1D {
|
||||
struct BinOp1D {
|
||||
const int max_bins_;
|
||||
const double mul_;
|
||||
typename KeyViewType::const_value_type range_;
|
||||
typename KeyViewType::const_value_type min_;
|
||||
|
||||
//Construct BinOp with number of bins, minimum value and maximum value
|
||||
DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
|
||||
BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
|
||||
typename KeyViewType::const_value_type max )
|
||||
:max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {}
|
||||
|
||||
@ -298,13 +296,13 @@ struct DefaultBinOp1D {
|
||||
};
|
||||
|
||||
template<class KeyViewType>
|
||||
struct DefaultBinOp3D {
|
||||
struct BinOp3D {
|
||||
int max_bins_[3];
|
||||
double mul_[3];
|
||||
typename KeyViewType::non_const_value_type range_[3];
|
||||
typename KeyViewType::non_const_value_type min_[3];
|
||||
|
||||
DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
|
||||
BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
|
||||
typename KeyViewType::const_value_type max[] )
|
||||
{
|
||||
max_bins_[0] = max_bins__[0]+1;
|
||||
@ -348,109 +346,11 @@ struct DefaultBinOp3D {
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct min_max {
|
||||
Scalar min;
|
||||
Scalar max;
|
||||
bool init;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
min_max() {
|
||||
min = 0;
|
||||
max = 0;
|
||||
init = 0;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
min_max (const min_max& val) {
|
||||
min = val.min;
|
||||
max = val.max;
|
||||
init = val.init;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
min_max operator = (const min_max& val) {
|
||||
min = val.min;
|
||||
max = val.max;
|
||||
init = val.init;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator+= (const Scalar& val) {
|
||||
if(init) {
|
||||
min = min<val?min:val;
|
||||
max = max>val?max:val;
|
||||
} else {
|
||||
min = val;
|
||||
max = val;
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator+= (const min_max& val) {
|
||||
if(init && val.init) {
|
||||
min = min<val.min?min:val.min;
|
||||
max = max>val.max?max:val.max;
|
||||
} else {
|
||||
if(val.init) {
|
||||
min = val.min;
|
||||
max = val.max;
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator+= (volatile const Scalar& val) volatile {
|
||||
if(init) {
|
||||
min = min<val?min:val;
|
||||
max = max>val?max:val;
|
||||
} else {
|
||||
min = val;
|
||||
max = val;
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator+= (volatile const min_max& val) volatile {
|
||||
if(init && val.init) {
|
||||
min = min<val.min?min:val.min;
|
||||
max = max>val.max?max:val.max;
|
||||
} else {
|
||||
if(val.init) {
|
||||
min = val.min;
|
||||
max = val.max;
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<class ViewType>
|
||||
struct min_max_functor {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType view;
|
||||
typedef min_max<typename ViewType::non_const_value_type> value_type;
|
||||
min_max_functor (const ViewType view_):view(view_) {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const size_t& i, value_type& val) const {
|
||||
val += view(i);
|
||||
}
|
||||
};
|
||||
namespace Impl {
|
||||
|
||||
template<class ViewType>
|
||||
bool try_std_sort(ViewType view) {
|
||||
bool possible = true;
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
size_t stride[8];
|
||||
view.stride(stride);
|
||||
#else
|
||||
size_t stride[8] = { view.stride_0()
|
||||
, view.stride_1()
|
||||
, view.stride_2()
|
||||
@ -460,8 +360,7 @@ bool try_std_sort(ViewType view) {
|
||||
, view.stride_6()
|
||||
, view.stride_7()
|
||||
};
|
||||
#endif
|
||||
possible = possible && Impl::is_same<typename ViewType::memory_space, HostSpace>::value;
|
||||
possible = possible && std::is_same<typename ViewType::memory_space, HostSpace>::value;
|
||||
possible = possible && (ViewType::Rank == 1);
|
||||
possible = possible && (stride[0] == 1);
|
||||
if(possible) {
|
||||
@ -470,27 +369,39 @@ bool try_std_sort(ViewType view) {
|
||||
return possible;
|
||||
}
|
||||
|
||||
template<class ViewType>
|
||||
struct min_max_functor {
|
||||
typedef Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> minmax_scalar;
|
||||
|
||||
ViewType view;
|
||||
min_max_functor(const ViewType& view_):view(view_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const size_t& i, minmax_scalar& minmax) const {
|
||||
if(view(i) < minmax.min_val) minmax.min_val = view(i);
|
||||
if(view(i) > minmax.max_val) minmax.max_val = view(i);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
template<class ViewType>
|
||||
void sort(ViewType view, bool always_use_kokkos_sort = false) {
|
||||
if(!always_use_kokkos_sort) {
|
||||
if(SortImpl::try_std_sort(view)) return;
|
||||
if(Impl::try_std_sort(view)) return;
|
||||
}
|
||||
typedef BinOp1D<ViewType> CompType;
|
||||
|
||||
typedef SortImpl::DefaultBinOp1D<ViewType> CompType;
|
||||
SortImpl::min_max<typename ViewType::non_const_value_type> val;
|
||||
parallel_reduce(view.dimension_0(),SortImpl::min_max_functor<ViewType>(view),val);
|
||||
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true);
|
||||
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
|
||||
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
|
||||
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
|
||||
Impl::min_max_functor<ViewType>(view),reducer);
|
||||
if(result.min_val == result.max_val) return;
|
||||
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
|
||||
bin_sort.create_permute_vector();
|
||||
bin_sort.sort(view);
|
||||
}
|
||||
|
||||
/*template<class ViewType, class Comparator>
|
||||
void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) {
|
||||
|
||||
}*/
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
|
||||
@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
else
|
||||
CXX ?= g++
|
||||
CXXFLAGS ?= -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
endif
|
||||
|
||||
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
|
||||
|
||||
TEST_TARGETS =
|
||||
|
||||
@ -131,6 +131,10 @@ void test_1D_sort(unsigned int n,bool force_kokkos) {
|
||||
typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
|
||||
KeyViewType keys("Keys",n);
|
||||
|
||||
// Test sorting array with all numbers equal
|
||||
Kokkos::deep_copy(keys,KeyType(1));
|
||||
Kokkos::sort(keys,force_kokkos);
|
||||
|
||||
Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
|
||||
Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
|
||||
|
||||
@ -174,7 +178,7 @@ void test_3D_sort(unsigned int n) {
|
||||
typename KeyViewType::value_type min[3] = {0,0,0};
|
||||
typename KeyViewType::value_type max[3] = {100,100,100};
|
||||
|
||||
typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp;
|
||||
typedef Kokkos::BinOp3D< KeyViewType > BinOp;
|
||||
BinOp bin_op(bin_max,min,max);
|
||||
Kokkos::BinSort< KeyViewType , BinOp >
|
||||
Sorter(keys,bin_op,false);
|
||||
|
||||
43
lib/kokkos/benchmarks/bytes_and_flops/Makefile
Normal file
43
lib/kokkos/benchmarks/bytes_and_flops/Makefile
Normal file
@ -0,0 +1,43 @@
|
||||
KOKKOS_PATH = ${HOME}/kokkos
|
||||
SRC = $(wildcard *.cpp)
|
||||
KOKKOS_DEVICES=Cuda
|
||||
KOKKOS_CUDA_OPTIONS=enable_lambda
|
||||
|
||||
default: build
|
||||
echo "Start Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
|
||||
EXE = bytes_and_flops.cuda
|
||||
KOKKOS_DEVICES = "Cuda,OpenMP"
|
||||
KOKKOS_ARCH = "SNB,Kepler35"
|
||||
else
|
||||
CXX = g++
|
||||
EXE = bytes_and_flops.host
|
||||
KOKKOS_DEVICES = "OpenMP"
|
||||
KOKKOS_ARCH = "SNB"
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3 -g
|
||||
|
||||
DEPFLAGS = -M
|
||||
LINK = ${CXX}
|
||||
LINKFLAGS =
|
||||
|
||||
OBJ = $(SRC:.cpp=.o)
|
||||
LIB =
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
build: $(EXE)
|
||||
|
||||
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o *.cuda *.host
|
||||
|
||||
# Compilation rules
|
||||
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) bench.hpp bench_unroll_stride.hpp bench_stride.hpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
99
lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
Normal file
99
lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
|
||||
template<class Scalar, int Unroll,int Stride>
|
||||
struct Run {
|
||||
static void run(int N, int K, int R, int F, int T, int S);
|
||||
};
|
||||
|
||||
template<class Scalar, int Stride>
|
||||
struct RunStride {
|
||||
static void run_1(int N, int K, int R, int F, int T, int S);
|
||||
static void run_2(int N, int K, int R, int F, int T, int S);
|
||||
static void run_3(int N, int K, int R, int F, int T, int S);
|
||||
static void run_4(int N, int K, int R, int F, int T, int S);
|
||||
static void run_5(int N, int K, int R, int F, int T, int S);
|
||||
static void run_6(int N, int K, int R, int F, int T, int S);
|
||||
static void run_7(int N, int K, int R, int F, int T, int S);
|
||||
static void run_8(int N, int K, int R, int F, int T, int S);
|
||||
static void run(int N, int K, int R, int U, int F, int T, int S);
|
||||
};
|
||||
|
||||
#define STRIDE 1
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
#define STRIDE 2
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
#define STRIDE 4
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
#define STRIDE 8
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
#define STRIDE 16
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
#define STRIDE 32
|
||||
#include<bench_stride.hpp>
|
||||
#undef STRIDE
|
||||
|
||||
template<class Scalar>
|
||||
void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) {
|
||||
if(D == 1)
|
||||
RunStride<Scalar,1>::run(N,K,R,U,F,T,S);
|
||||
if(D == 2)
|
||||
RunStride<Scalar,2>::run(N,K,R,U,F,T,S);
|
||||
if(D == 4)
|
||||
RunStride<Scalar,4>::run(N,K,R,U,F,T,S);
|
||||
if(D == 8)
|
||||
RunStride<Scalar,8>::run(N,K,R,U,F,T,S);
|
||||
if(D == 16)
|
||||
RunStride<Scalar,16>::run(N,K,R,U,F,T,S);
|
||||
if(D == 32)
|
||||
RunStride<Scalar,32>::run(N,K,R,U,F,T,S);
|
||||
}
|
||||
|
||||
124
lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
Normal file
124
lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
Normal file
@ -0,0 +1,124 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
|
||||
#define UNROLL 1
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 2
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 3
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 4
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 5
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 6
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 7
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 8
|
||||
#include<bench_unroll_stride.hpp>
|
||||
#undef UNROLL
|
||||
|
||||
template<class Scalar>
|
||||
struct RunStride<Scalar,STRIDE> {
|
||||
static void run_1(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,1,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_2(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,2,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_3(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,3,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_4(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,4,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_5(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,5,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_6(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,6,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_7(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,7,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
static void run_8(int N, int K, int R, int F, int T, int S) {
|
||||
Run<Scalar,8,STRIDE>::run(N,K,R,F,T,S);
|
||||
}
|
||||
|
||||
static void run(int N, int K, int R, int U, int F, int T, int S) {
|
||||
if(U==1) {
|
||||
run_1(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==2) {
|
||||
run_2(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==3) {
|
||||
run_3(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==4) {
|
||||
run_4(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==5) {
|
||||
run_5(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==6) {
|
||||
run_6(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==7) {
|
||||
run_7(N,K,R,F,T,S);
|
||||
}
|
||||
if(U==8) {
|
||||
run_8(N,K,R,F,T,S);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
148
lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
Normal file
148
lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
Normal file
@ -0,0 +1,148 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
template<class Scalar>
|
||||
struct Run<Scalar,UNROLL,STRIDE> {
|
||||
static void run(int N, int K, int R, int F, int T, int S) {
|
||||
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> A("A",N,K);
|
||||
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> B("B",N,K);
|
||||
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> C("C",N,K);
|
||||
|
||||
Kokkos::deep_copy(A,Scalar(1.5));
|
||||
Kokkos::deep_copy(B,Scalar(2.5));
|
||||
Kokkos::deep_copy(C,Scalar(3.5));
|
||||
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for("BenchmarkKernel",Kokkos::TeamPolicy<>(N,T).set_scratch_size(0,Kokkos::PerTeam(S)),
|
||||
KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type& team) {
|
||||
const int n = team.league_rank();
|
||||
for(int r=0; r<R; r++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,K), [&] (const int& i) {
|
||||
Scalar a1 = A(n,i,0);
|
||||
const Scalar b = B(n,i,0);
|
||||
#if(UNROLL>1)
|
||||
Scalar a2 = a1*1.3;
|
||||
#endif
|
||||
#if(UNROLL>2)
|
||||
Scalar a3 = a2*1.1;
|
||||
#endif
|
||||
#if(UNROLL>3)
|
||||
Scalar a4 = a3*1.1;
|
||||
#endif
|
||||
#if(UNROLL>4)
|
||||
Scalar a5 = a4*1.3;
|
||||
#endif
|
||||
#if(UNROLL>5)
|
||||
Scalar a6 = a5*1.1;
|
||||
#endif
|
||||
#if(UNROLL>6)
|
||||
Scalar a7 = a6*1.1;
|
||||
#endif
|
||||
#if(UNROLL>7)
|
||||
Scalar a8 = a7*1.1;
|
||||
#endif
|
||||
|
||||
|
||||
for(int f = 0; f<F; f++) {
|
||||
a1 += b*a1;
|
||||
#if(UNROLL>1)
|
||||
a2 += b*a2;
|
||||
#endif
|
||||
#if(UNROLL>2)
|
||||
a3 += b*a3;
|
||||
#endif
|
||||
#if(UNROLL>3)
|
||||
a4 += b*a4;
|
||||
#endif
|
||||
#if(UNROLL>4)
|
||||
a5 += b*a5;
|
||||
#endif
|
||||
#if(UNROLL>5)
|
||||
a6 += b*a6;
|
||||
#endif
|
||||
#if(UNROLL>6)
|
||||
a7 += b*a7;
|
||||
#endif
|
||||
#if(UNROLL>7)
|
||||
a8 += b*a8;
|
||||
#endif
|
||||
|
||||
|
||||
}
|
||||
#if(UNROLL==1)
|
||||
C(n,i,0) = a1;
|
||||
#endif
|
||||
#if(UNROLL==2)
|
||||
C(n,i,0) = a1+a2;
|
||||
#endif
|
||||
#if(UNROLL==3)
|
||||
C(n,i,0) = a1+a2+a3;
|
||||
#endif
|
||||
#if(UNROLL==4)
|
||||
C(n,i,0) = a1+a2+a3+a4;
|
||||
#endif
|
||||
#if(UNROLL==5)
|
||||
C(n,i,0) = a1+a2+a3+a4+a5;
|
||||
#endif
|
||||
#if(UNROLL==6)
|
||||
C(n,i,0) = a1+a2+a3+a4+a5+a6;
|
||||
#endif
|
||||
#if(UNROLL==7)
|
||||
C(n,i,0) = a1+a2+a3+a4+a5+a6+a7;
|
||||
#endif
|
||||
#if(UNROLL==8)
|
||||
C(n,i,0) = a1+a2+a3+a4+a5+a6+a7+a8;
|
||||
#endif
|
||||
|
||||
});
|
||||
}
|
||||
});
|
||||
Kokkos::fence();
|
||||
double seconds = timer.seconds();
|
||||
|
||||
double bytes = 1.0*N*K*R*3*sizeof(Scalar);
|
||||
double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
|
||||
printf("NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf\n",N,K,R,UNROLL,F,T,S,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds);
|
||||
}
|
||||
};
|
||||
|
||||
96
lib/kokkos/benchmarks/bytes_and_flops/main.cpp
Normal file
96
lib/kokkos/benchmarks/bytes_and_flops/main.cpp
Normal file
@ -0,0 +1,96 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<bench.hpp>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize();
|
||||
|
||||
|
||||
if(argc<10) {
|
||||
printf("Arguments: N K R D U F T S\n");
|
||||
printf(" P: Precision (1==float, 2==double)\n");
|
||||
printf(" N,K: dimensions of the 2D array to allocate\n");
|
||||
printf(" R: how often to loop through the K dimension with each team\n");
|
||||
printf(" D: distance between loaded elements (stride)\n");
|
||||
printf(" U: how many independent flops to do per load\n");
|
||||
printf(" F: how many times to repeat the U unrolled operations before reading next element\n");
|
||||
printf(" T: team size\n");
|
||||
printf(" S: shared memory per team (used to control occupancy on GPUs)\n");
|
||||
printf("Example Input GPU:\n");
|
||||
printf(" Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n");
|
||||
printf(" Cache Bound : 2 100000 1024 64 1 1 1 512 20000\n");
|
||||
printf(" Compute Bound : 2 100000 1024 1 1 8 64 256 6000\n");
|
||||
printf(" Load Slots Used : 2 20000 256 32 16 1 1 256 6000\n");
|
||||
printf(" Inefficient Load: 2 20000 256 32 2 1 1 256 20000\n");
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int P = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
int K = atoi(argv[3]);
|
||||
int R = atoi(argv[4]);
|
||||
int D = atoi(argv[5]);
|
||||
int U = atoi(argv[6]);
|
||||
int F = atoi(argv[7]);
|
||||
int T = atoi(argv[8]);
|
||||
int S = atoi(argv[9]);
|
||||
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
|
||||
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}
|
||||
|
||||
if(P==1) {
|
||||
run_stride_unroll<float>(N,K,R,D,U,F,T,S);
|
||||
}
|
||||
if(P==2) {
|
||||
run_stride_unroll<double>(N,K,R,D,U,F,T,S);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
44
lib/kokkos/benchmarks/gather/Makefile
Normal file
44
lib/kokkos/benchmarks/gather/Makefile
Normal file
@ -0,0 +1,44 @@
|
||||
# Build file for the Kokkos gather benchmark.
#
# Produces gather.cuda when KOKKOS_DEVICES contains "Cuda", otherwise
# gather.host. KOKKOS_PATH must point at a Kokkos source tree that provides
# Makefile.kokkos.

KOKKOS_PATH = ${HOME}/kokkos
SRC = $(wildcard *.cpp)
KOKKOS_DEVICES=Cuda
KOKKOS_CUDA_OPTIONS=enable_lambda

# These targets do not name files; declaring them phony keeps make from
# skipping them when a file called "build" or "clean" exists.
.PHONY: default build clean

default: build
	echo "Start Build"

# Pick compiler, executable name, devices and architectures per backend.
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
EXE = gather.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
EXE = gather.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif

CXXFLAGS = -O3 -g

DEPFLAGS = -M
LINK = ${CXX}
LINKFLAGS =

OBJ = $(SRC:.cpp=.o)
LIB =

include $(KOKKOS_PATH)/Makefile.kokkos

# Print the preprocessor flags Makefile.kokkos produced.
$(warning ${KOKKOS_CPPFLAGS})
build: $(EXE)

$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)

clean: kokkos-clean
	rm -f *.o *.cuda *.host

# Compilation rules

# Every object also depends on the benchmark headers so header edits trigger a rebuild.
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
92
lib/kokkos/benchmarks/gather/gather.hpp
Normal file
92
lib/kokkos/benchmarks/gather/gather.hpp
Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Interface of the gather benchmark for one compile-time unroll count.
//
// The primary template only declares run().
//
// Arguments, as described by the benchmark's usage text:
//   N : number of entities
//   K : number of things to gather per entity
//   D : max distance of gathered things of an entity
//   R : number of repetitions
//   F : repetitions of the unrolled operations per loaded element
template <typename Scalar, int UNROLL>
struct RunGather {
  static void run(int N, int K, int D, int R, int F);
};
|
||||
|
||||
#define UNROLL 1
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 2
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 3
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 4
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 5
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 6
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 7
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
#define UNROLL 8
|
||||
#include<gather_unroll.hpp>
|
||||
#undef UNROLL
|
||||
|
||||
template<class Scalar>
|
||||
void run_gather_test(int N, int K, int D, int R, int U, int F) {
|
||||
if(U == 1)
|
||||
RunGather<Scalar,1>::run(N,K,D,R,F);
|
||||
if(U == 2)
|
||||
RunGather<Scalar,2>::run(N,K,D,R,F);
|
||||
if(U == 3)
|
||||
RunGather<Scalar,3>::run(N,K,D,R,F);
|
||||
if(U == 4)
|
||||
RunGather<Scalar,4>::run(N,K,D,R,F);
|
||||
if(U == 5)
|
||||
RunGather<Scalar,5>::run(N,K,D,R,F);
|
||||
if(U == 6)
|
||||
RunGather<Scalar,6>::run(N,K,D,R,F);
|
||||
if(U == 7)
|
||||
RunGather<Scalar,7>::run(N,K,D,R,F);
|
||||
if(U == 8)
|
||||
RunGather<Scalar,8>::run(N,K,D,R,F);
|
||||
}
|
||||
169
lib/kokkos/benchmarks/gather/gather_unroll.hpp
Normal file
169
lib/kokkos/benchmarks/gather/gather_unroll.hpp
Normal file
@ -0,0 +1,169 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<Kokkos_Random.hpp>
|
||||
|
||||
template<class Scalar>
|
||||
struct RunGather<Scalar,UNROLL> {
|
||||
static void run(int N, int K, int D, int R, int F) {
|
||||
Kokkos::View<int**> connectivity("Connectivity",N,K);
|
||||
Kokkos::View<Scalar*> A_in("Input",N);
|
||||
Kokkos::View<Scalar*> B_in("Input",N);
|
||||
Kokkos::View<Scalar*> C("Output",N);
|
||||
|
||||
Kokkos::Random_XorShift64_Pool<> rand_pool(12313);
|
||||
|
||||
Kokkos::deep_copy(A_in,1.5);
|
||||
Kokkos::deep_copy(B_in,2.0);
|
||||
|
||||
Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > A(A_in);
|
||||
Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > B(B_in);
|
||||
|
||||
Kokkos::parallel_for("InitKernel",N,
|
||||
KOKKOS_LAMBDA (const int& i) {
|
||||
auto rand_gen = rand_pool.get_state();
|
||||
for( int jj=0; jj<K; jj++) {
|
||||
connectivity(i,jj) = (rand_gen.rand(D) + i - D/2 + N)%N;
|
||||
}
|
||||
rand_pool.free_state(rand_gen);
|
||||
});
|
||||
Kokkos::fence();
|
||||
|
||||
|
||||
Kokkos::Timer timer;
|
||||
for(int r = 0; r<R; r++) {
|
||||
Kokkos::parallel_for("BenchmarkKernel",N,
|
||||
KOKKOS_LAMBDA (const int& i) {
|
||||
Scalar c = Scalar(0.0);
|
||||
for( int jj=0; jj<K; jj++) {
|
||||
const int j = connectivity(i,jj);
|
||||
Scalar a1 = A(j);
|
||||
const Scalar b = B(j);
|
||||
#if(UNROLL>1)
|
||||
Scalar a2 = a1*Scalar(1.3);
|
||||
#endif
|
||||
#if(UNROLL>2)
|
||||
Scalar a3 = a2*Scalar(1.1);
|
||||
#endif
|
||||
#if(UNROLL>3)
|
||||
Scalar a4 = a3*Scalar(1.1);
|
||||
#endif
|
||||
#if(UNROLL>4)
|
||||
Scalar a5 = a4*Scalar(1.3);
|
||||
#endif
|
||||
#if(UNROLL>5)
|
||||
Scalar a6 = a5*Scalar(1.1);
|
||||
#endif
|
||||
#if(UNROLL>6)
|
||||
Scalar a7 = a6*Scalar(1.1);
|
||||
#endif
|
||||
#if(UNROLL>7)
|
||||
Scalar a8 = a7*Scalar(1.1);
|
||||
#endif
|
||||
|
||||
|
||||
for(int f = 0; f<F; f++) {
|
||||
a1 += b*a1;
|
||||
#if(UNROLL>1)
|
||||
a2 += b*a2;
|
||||
#endif
|
||||
#if(UNROLL>2)
|
||||
a3 += b*a3;
|
||||
#endif
|
||||
#if(UNROLL>3)
|
||||
a4 += b*a4;
|
||||
#endif
|
||||
#if(UNROLL>4)
|
||||
a5 += b*a5;
|
||||
#endif
|
||||
#if(UNROLL>5)
|
||||
a6 += b*a6;
|
||||
#endif
|
||||
#if(UNROLL>6)
|
||||
a7 += b*a7;
|
||||
#endif
|
||||
#if(UNROLL>7)
|
||||
a8 += b*a8;
|
||||
#endif
|
||||
|
||||
|
||||
}
|
||||
#if(UNROLL==1)
|
||||
c += a1;
|
||||
#endif
|
||||
#if(UNROLL==2)
|
||||
c += a1+a2;
|
||||
#endif
|
||||
#if(UNROLL==3)
|
||||
c += a1+a2+a3;
|
||||
#endif
|
||||
#if(UNROLL==4)
|
||||
c += a1+a2+a3+a4;
|
||||
#endif
|
||||
#if(UNROLL==5)
|
||||
c += a1+a2+a3+a4+a5;
|
||||
#endif
|
||||
#if(UNROLL==6)
|
||||
c += a1+a2+a3+a4+a5+a6;
|
||||
#endif
|
||||
#if(UNROLL==7)
|
||||
c += a1+a2+a3+a4+a5+a6+a7;
|
||||
#endif
|
||||
#if(UNROLL==8)
|
||||
c += a1+a2+a3+a4+a5+a6+a7+a8;
|
||||
#endif
|
||||
|
||||
}
|
||||
C(i) = c ;
|
||||
});
|
||||
Kokkos::fence();
|
||||
}
|
||||
double seconds = timer.seconds();
|
||||
|
||||
double bytes = 1.0*N*K*R*(2*sizeof(Scalar)+sizeof(int)) + 1.0*N*R*sizeof(Scalar);
|
||||
double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
|
||||
double gather_ops = 1.0*N*K*R*2;
|
||||
printf("SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf GGather/s: %lf\n",sizeof(Scalar)/4,N,K,D,R,UNROLL,F,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds,1.e-9*gather_ops/seconds);
|
||||
}
|
||||
};
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,73 +36,58 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<gather.hpp>
|
||||
|
||||
#include <impl/Kokkos_HBWAllocators.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
|
||||
#include <stdint.h> // uintptr_t
|
||||
#include <cstdlib> // for malloc, realloc, and free
|
||||
#include <cstring> // for memcpy
|
||||
|
||||
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
|
||||
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
#include <memkind.h>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void* HBWMallocAllocator::allocate( size_t size )
|
||||
{
|
||||
std::cout<< "Allocate HBW: " << 1.0e-6*size << "MB" << std::endl;
|
||||
void * ptr = NULL;
|
||||
if (size) {
|
||||
ptr = memkind_malloc(MEMKIND_TYPE,size);
|
||||
|
||||
if (!ptr)
|
||||
{
|
||||
std::ostringstream msg ;
|
||||
msg << name() << ": allocate(" << size << ") FAILED";
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
if(argc<8) {
|
||||
printf("Arguments: S N K D\n");
|
||||
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");
|
||||
printf(" N: Number of entities\n");
|
||||
printf(" K: Number of things to gather per entity\n");
|
||||
printf(" D: Max distance of gathered things of an entity\n");
|
||||
printf(" R: how often to loop through the K dimension with each team\n");
|
||||
printf(" U: how many independent flops to do per load\n");
|
||||
printf(" F: how many times to repeat the U unrolled operations before reading next element\n");
|
||||
printf("Example Input GPU:\n");
|
||||
printf(" Bandwidth Bound : 2 10000000 1 1 10 1 1\n");
|
||||
printf(" Cache Bound : 2 10000000 64 1 10 1 1\n");
|
||||
printf(" Cache Gather : 2 10000000 64 256 10 1 1\n");
|
||||
printf(" Global Gather : 2 100000000 16 100000000 1 1 1\n");
|
||||
printf(" Typical MD : 2 100000 32 512 1000 8 2\n");
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
return ptr;
|
||||
|
||||
|
||||
int S = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
int K = atoi(argv[3]);
|
||||
int D = atoi(argv[4]);
|
||||
int R = atoi(argv[5]);
|
||||
int U = atoi(argv[6]);
|
||||
int F = atoi(argv[7]);
|
||||
|
||||
if( (S!=1) && (S!=2) && (S!=4)) {printf("S must be one of 1,2,4\n"); return 0;}
|
||||
if( N<D ) {printf("N must be larger or equal to D\n"); return 0; }
|
||||
if(S==1) {
|
||||
run_gather_test<float>(N,K,D,R,U,F);
|
||||
}
|
||||
if(S==2) {
|
||||
run_gather_test<double>(N,K,D,R,U,F);
|
||||
}
|
||||
if(S==4) {
|
||||
run_gather_test<Kokkos::complex<double> >(N,K,D,R,U,F);
|
||||
}
|
||||
Kokkos::finalize();
|
||||
}
|
||||
|
||||
void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
if (ptr) {
|
||||
memkind_free(MEMKIND_TYPE,ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
|
||||
{
|
||||
void * ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size);
|
||||
|
||||
if (new_size > 0u && ptr == NULL) {
|
||||
Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
#endif
|
||||
284
lib/kokkos/bin/nvcc_wrapper
Executable file
284
lib/kokkos/bin/nvcc_wrapper
Executable file
@ -0,0 +1,284 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# This shell script (nvcc_wrapper) wraps both the host compiler and
|
||||
# NVCC, if you are building legacy C or C++ code with CUDA enabled.
|
||||
# The script remedies some differences between the interface of NVCC
|
||||
# and that of the host compiler, in particular for linking.
|
||||
# It also means that a legacy code doesn't need separate .cu files;
|
||||
# it can just use .cpp files.
|
||||
#
|
||||
# Default settings: change those according to your machine. For
|
||||
# example, you may have two different wrappers with either icpc
|
||||
# or g++ as their back-end compiler. The defaults can be overwritten
|
||||
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
|
||||
|
||||
default_arch="sm_35"
|
||||
#default_arch="sm_50"
|
||||
|
||||
#
|
||||
# The default C++ compiler.
|
||||
#
|
||||
host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
|
||||
#host_compiler="icpc"
|
||||
#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
|
||||
#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
|
||||
|
||||
#
|
||||
# Internal variables
|
||||
#
|
||||
|
||||
# C++ files
|
||||
cpp_files=""
|
||||
|
||||
# Host compiler arguments
|
||||
xcompiler_args=""
|
||||
|
||||
# Cuda (NVCC) only arguments
|
||||
cuda_args=""
|
||||
|
||||
# Arguments for both NVCC and Host compiler
|
||||
shared_args=""
|
||||
|
||||
# Linker arguments
|
||||
xlinker_args=""
|
||||
|
||||
# Object files passable to NVCC
|
||||
object_files=""
|
||||
|
||||
# Link objects for the host linker only
|
||||
object_files_xlinker=""
|
||||
|
||||
# Shared libraries with version numbers are not handled correctly by NVCC
|
||||
shared_versioned_libraries_host=""
|
||||
shared_versioned_libraries=""
|
||||
|
||||
# Does the User set the architecture
|
||||
arch_set=0
|
||||
|
||||
# Does the user overwrite the host compiler
|
||||
ccbin_set=0
|
||||
|
||||
#Error code of compilation
|
||||
error_code=0
|
||||
|
||||
# Do a dry run without actually compiling
|
||||
dry_run=0
|
||||
|
||||
# Skip NVCC compilation and use host compiler directly
|
||||
host_only=0
|
||||
|
||||
# Enable workaround for CUDA 6.5 for pragma ident
|
||||
replace_pragma_ident=0
|
||||
|
||||
# Mark first host compiler argument
|
||||
first_xcompiler_arg=1
|
||||
|
||||
temp_dir=${TMPDIR:-/tmp}
|
||||
|
||||
# Check if we have an optimization argument already
|
||||
optimization_applied=0
|
||||
|
||||
#echo "Arguments: $# $@"
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case $1 in
|
||||
#show the executed command
|
||||
--show|--nvcc-wrapper-show)
|
||||
dry_run=1
|
||||
;;
|
||||
#run host compilation only
|
||||
--host-only)
|
||||
host_only=1
|
||||
;;
|
||||
#replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and the non-standardized behaviour of pragma with macros
|
||||
--replace-pragma-ident)
|
||||
replace_pragma_ident=1
|
||||
;;
|
||||
#handle source files to be compiled as cuda files
|
||||
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
|
||||
cpp_files="$cpp_files $1"
|
||||
;;
|
||||
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
|
||||
-O*)
|
||||
if [ $optimization_applied -eq 1 ]; then
|
||||
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
|
||||
else
|
||||
shared_args="$shared_args $1"
|
||||
optimization_applied=1
|
||||
fi
|
||||
;;
|
||||
#Handle shared args (valid for both nvcc and the host compiler)
|
||||
-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#Handle shared args that have an argument
|
||||
-o|-MT)
|
||||
shared_args="$shared_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle known nvcc args
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle more known nvcc args
|
||||
--expt-extended-lambda|--expt-relaxed-constexpr)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
-rdc|-maxrregcount|--default-stream)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle c++11 setting
|
||||
--std=c++11|-std=c++11)
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#strip off -std=c++98 because it causes nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
|
||||
-std=c++98|--std=c++98)
|
||||
;;
|
||||
#strip off pedantic because it produces endless warnings about #LINE added by the preprocessor
|
||||
-pedantic|-Wpedantic|-ansi)
|
||||
;;
|
||||
#strip -Xcompiler because we add it
|
||||
-Xcompiler)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="$2"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$2"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
#strip off "-x cu" because we add that
|
||||
-x)
|
||||
if [[ $2 != "cu" ]]; then
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="-x,$2"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,-x,$2"
|
||||
fi
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
#Handle -ccbin (if it's not set we can set it to a default value)
|
||||
-ccbin)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
ccbin_set=1
|
||||
host_compiler=$2
|
||||
shift
|
||||
;;
|
||||
#Handle -arch argument (if it's not set, use a default)
|
||||
-arch*)
|
||||
cuda_args="$cuda_args $1"
|
||||
arch_set=1
|
||||
;;
|
||||
#Handle -Xcudafe argument
|
||||
-Xcudafe)
|
||||
cuda_args="$cuda_args -Xcudafe $2"
|
||||
shift
|
||||
;;
|
||||
#Handle args that should be sent to the linker
|
||||
-Wl*)
|
||||
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
|
||||
host_linker_args="$host_linker_args ${1:4:${#1}}"
|
||||
;;
|
||||
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.a|*.so|*.o|*.obj)
|
||||
object_files="$object_files $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.dylib)
|
||||
object_files="$object_files -Xlinker $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle shared libraries with *.so.* names which nvcc can't do.
|
||||
*.so.*)
|
||||
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
|
||||
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
|
||||
;;
|
||||
#All other args are sent to the host compiler
|
||||
*)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args=$1
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$1"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
shift
|
||||
done
|
||||
|
||||
#Add default host compiler if necessary
|
||||
if [ $ccbin_set -ne 1 ]; then
|
||||
cuda_args="$cuda_args -ccbin $host_compiler"
|
||||
fi
|
||||
|
||||
#Add architecture command
|
||||
if [ $arch_set -ne 1 ]; then
|
||||
cuda_args="$cuda_args -arch=$default_arch"
|
||||
fi
|
||||
|
||||
#Compose compilation command
|
||||
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
|
||||
if [ $first_xcompiler_arg -eq 0 ]; then
|
||||
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
|
||||
fi
|
||||
|
||||
#Compose host only command
|
||||
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
|
||||
|
||||
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
|
||||
if [ $replace_pragma_ident -eq 1 ]; then
|
||||
cpp_files2=""
|
||||
for file in $cpp_files
|
||||
do
|
||||
var=`grep pragma ${file} | grep ident | grep "#"`
|
||||
if [ "${#var}" -gt 0 ]
|
||||
then
|
||||
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file
|
||||
cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file"
|
||||
else
|
||||
cpp_files2="$cpp_files2 $file"
|
||||
fi
|
||||
done
|
||||
cpp_files=$cpp_files2
|
||||
#echo $cpp_files
|
||||
fi
|
||||
|
||||
if [ "$cpp_files" ]; then
|
||||
nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
|
||||
else
|
||||
nvcc_command="$nvcc_command $object_files"
|
||||
fi
|
||||
|
||||
if [ "$cpp_files" ]; then
|
||||
host_command="$host_command $object_files $cpp_files"
|
||||
else
|
||||
host_command="$host_command $object_files"
|
||||
fi
|
||||
|
||||
#Print command for dryrun
|
||||
if [ $dry_run -eq 1 ]; then
|
||||
if [ $host_only -eq 1 ]; then
|
||||
echo $host_command
|
||||
else
|
||||
echo $nvcc_command
|
||||
fi
|
||||
exit 0
|
||||
fi
|
||||
|
||||
#Run compilation command
|
||||
if [ $host_only -eq 1 ]; then
|
||||
$host_command
|
||||
else
|
||||
$nvcc_command
|
||||
fi
|
||||
error_code=$?
|
||||
|
||||
#Report error code
|
||||
exit $error_code
|
||||
@ -53,12 +53,12 @@
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
|
||||
#include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
|
||||
|
||||
IF (TPL_ENABLE_CUDA)
|
||||
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
|
||||
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
|
||||
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
|
||||
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
|
||||
ENDIF()
|
||||
#IF (TPL_ENABLE_CUDA)
|
||||
# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
|
||||
# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
|
||||
# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
|
||||
# TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
|
||||
#ENDIF()
|
||||
|
||||
|
||||
@ -1,6 +1,16 @@
|
||||
INCLUDE(CMakeParseArguments)
|
||||
INCLUDE(CTest)
|
||||
|
||||
cmake_policy(SET CMP0054 NEW)
|
||||
|
||||
IF(NOT DEFINED ${PROJECT_NAME})
|
||||
project(Kokkos)
|
||||
ENDIF()
|
||||
|
||||
# Default <Project>_ENABLE_DEBUG to OFF only when the caller has not set it.
# DEFINED expects a variable *name*; the previous form dereferenced the name
# and carried a stray '}', so the test was always true and any user-provided
# value was overwritten with OFF.
IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG)
|
||||
SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
|
||||
ENDIF()
|
||||
|
||||
FUNCTION(ASSERT_DEFINED VARS)
|
||||
FOREACH(VAR ${VARS})
|
||||
IF(NOT DEFINED ${VAR})
|
||||
@ -75,6 +85,13 @@ MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
|
||||
|
||||
ENDMACRO()
|
||||
|
||||
|
||||
function(INCLUDE_DIRECTORIES)
|
||||
cmake_parse_arguments(INCLUDE_DIRECTORIES "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN})
|
||||
_INCLUDE_DIRECTORIES(${INCLUDE_DIRECTORIES_UNPARSED_ARGUMENTS})
|
||||
endfunction()
|
||||
|
||||
|
||||
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
|
||||
SET(PROP_VALUES)
|
||||
FOREACH(TARGET_X ${ARGN})
|
||||
@ -271,6 +288,11 @@ ENDFUNCTION()
|
||||
|
||||
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
|
||||
|
||||
FUNCTION(TRIBITS_ADD_TEST)
|
||||
ENDFUNCTION()
|
||||
FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
|
||||
ENDFUNCTION()
|
||||
|
||||
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
|
||||
|
||||
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
|
||||
|
||||
0
lib/kokkos/config/configure_compton_cpu.sh
Executable file → Normal file
0
lib/kokkos/config/configure_compton_cpu.sh
Executable file → Normal file
0
lib/kokkos/config/configure_compton_mic.sh
Executable file → Normal file
0
lib/kokkos/config/configure_compton_mic.sh
Executable file → Normal file
0
lib/kokkos/config/configure_kokkos.sh
Executable file → Normal file
0
lib/kokkos/config/configure_kokkos.sh
Executable file → Normal file
0
lib/kokkos/config/configure_kokkos_nvidia.sh
Executable file → Normal file
0
lib/kokkos/config/configure_kokkos_nvidia.sh
Executable file → Normal file
0
lib/kokkos/config/configure_shannon.sh
Executable file → Normal file
0
lib/kokkos/config/configure_shannon.sh
Executable file → Normal file
@ -91,9 +91,20 @@ Step 3:
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 4:
|
||||
4.1. Once all Trilinos tests pass promote Kokkos develop branch to master on Github
|
||||
Step 4: Once all Trilinos tests pass promote Kokkos develop branch to master on Github
|
||||
4.1. Generate Changelog (You need a github API token)
|
||||
|
||||
Close all Open issues with "InDevelop" tag on github
|
||||
|
||||
(Not from kokkos directory)
|
||||
github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG'
|
||||
|
||||
(Copy the new section from the generated CHANGELOG.md to the kokkos/CHANGELOG.md)
|
||||
(Make desired changes to CHANGELOG.md to enhance clarity)
|
||||
(Commit and push the CHANGELOG to develop)
|
||||
|
||||
4.2 Merge develop into Master
|
||||
|
||||
- DO NOT fast-forward the merge!!!!
|
||||
|
||||
(From kokkos directory):
|
||||
@ -103,7 +114,7 @@ Step 4:
|
||||
git reset --hard origin/master
|
||||
git merge --no-ff origin/develop
|
||||
|
||||
4.2. Update the tag in kokkos/config/master_history.txt
|
||||
4.3. Update the tag in kokkos/config/master_history.txt
|
||||
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
|
||||
Tag format: #.#.##
|
||||
|
||||
|
||||
@ -1,3 +1,6 @@
|
||||
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
|
||||
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a
|
||||
|
||||
tag: 2.01.10 date: 09:27:2016 master: e4119325 develop: e6cda11e
|
||||
tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e
|
||||
tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304
|
||||
tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
|
||||
|
||||
@ -121,6 +121,10 @@ do
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle more known nvcc args
|
||||
--expt-extended-lambda|--expt-relaxed-constexpr)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
-rdc|-maxrregcount|--default-stream)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
|
||||
@ -16,6 +16,8 @@ elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
|
||||
MACHINE=bowman
|
||||
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
|
||||
MACHINE=shepard
|
||||
elif [[ "$HOSTNAME" =~ apollo ]]; then
|
||||
MACHINE=apollo
|
||||
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
|
||||
MACHINE=sems
|
||||
else
|
||||
@ -28,6 +30,7 @@ IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
|
||||
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
|
||||
CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial"
|
||||
|
||||
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
|
||||
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
@ -44,102 +47,12 @@ BUILD_ONLY=False
|
||||
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
|
||||
TEST_SCRIPT=False
|
||||
SKIP_HWLOC=False
|
||||
SPOT_CHECK=False
|
||||
|
||||
ARCH_FLAG=""
|
||||
PRINT_HELP=False
|
||||
OPT_FLAG=""
|
||||
KOKKOS_OPTIONS=""
|
||||
|
||||
#
|
||||
# Machine specific config
|
||||
#
|
||||
|
||||
if [ "$MACHINE" = "sems" ]; then
|
||||
source /projects/modulefiles/utils/sems-modules-init.sh
|
||||
source /projects/modulefiles/utils/kokkos-modules-init.sh
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
elif [ "$MACHINE" = "white" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
|
||||
|
||||
# Don't do pthread on white
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=Power8"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
elif [ "$MACHINE" = "bowman" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=KNL"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
elif [ "$MACHINE" = "shepard" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=HSW"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
else
|
||||
echo "Unhandled machine $MACHINE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
#
|
||||
# Handle arguments
|
||||
@ -173,7 +86,211 @@ NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
|
||||
--dry-run*)
|
||||
DRYRUN=True
|
||||
;;
|
||||
--help)
|
||||
--spot-check*)
|
||||
SPOT_CHECK=True
|
||||
;;
|
||||
--arch*)
|
||||
ARCH_FLAG="--arch=${key#*=}"
|
||||
;;
|
||||
--opt-flag*)
|
||||
OPT_FLAG="${key#*=}"
|
||||
;;
|
||||
--with-cuda-options*)
|
||||
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
|
||||
;;
|
||||
--help*)
|
||||
PRINT_HELP=True
|
||||
;;
|
||||
*)
|
||||
# args, just append
|
||||
ARGS="$ARGS $1"
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
# set kokkos path
|
||||
if [ -z "$KOKKOS_PATH" ]; then
|
||||
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
|
||||
else
|
||||
# Ensure KOKKOS_PATH is abs path
|
||||
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
|
||||
fi
|
||||
|
||||
#
|
||||
# Machine specific config
|
||||
#
|
||||
|
||||
if [ "$MACHINE" = "sems" ]; then
|
||||
source /projects/sems/modulefiles/utils/sems-modules-init.sh
|
||||
|
||||
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
|
||||
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
|
||||
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG=""
|
||||
fi
|
||||
|
||||
if [ "$SPOT_CHECK" = "True" ]; then
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
fi
|
||||
|
||||
elif [ "$MACHINE" = "white" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
|
||||
|
||||
# Don't do pthread on white
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=Power8,Kepler37"
|
||||
fi
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
|
||||
elif [ "$MACHINE" = "bowman" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=KNL"
|
||||
fi
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
|
||||
elif [ "$MACHINE" = "shepard" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=HSW"
|
||||
fi
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
|
||||
elif [ "$MACHINE" = "apollo" ]; then
|
||||
source /projects/sems/modulefiles/utils/sems-modules-init.sh
|
||||
module use /home/projects/modulefiles/local/x86-64
|
||||
module load kokkos-env
|
||||
|
||||
module load sems-git
|
||||
module load sems-tex
|
||||
module load sems-cmake/3.5.2
|
||||
module load sems-gdb
|
||||
|
||||
SKIP_HWLOC=True
|
||||
|
||||
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
|
||||
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
|
||||
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
|
||||
|
||||
CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
|
||||
NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
|
||||
|
||||
BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
|
||||
BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
|
||||
BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
|
||||
|
||||
if [ "$SPOT_CHECK" = "True" ]; then
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
|
||||
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
fi
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=SNB,Kepler35"
|
||||
fi
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
else
|
||||
echo "Unhandled machine $MACHINE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
if [ "$PRINT_HELP" = "True" ]; then
|
||||
echo "test_all_sandia <ARGS> <OPTIONS>:"
|
||||
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
|
||||
echo " Defaults to root repo containing this script"
|
||||
@ -183,6 +300,9 @@ echo "--skip-hwloc: Do not do hwloc tests"
|
||||
echo "--num=N: Number of jobs to run in parallel "
|
||||
echo "--dry-run: Just print what would be executed"
|
||||
echo "--build-only: Just do builds, don't run anything"
|
||||
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
|
||||
echo "--arch=ARCHITECTURE: overwrite architecture flags"
|
||||
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
|
||||
echo "--build-list=BUILD,BUILD,BUILD..."
|
||||
echo " Provide a comma-separated list of builds instead of running all builds"
|
||||
echo " Valid items:"
|
||||
@ -220,21 +340,6 @@ echo " hit ctrl-z"
|
||||
echo " % kill -9 %1"
|
||||
echo
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
# args, just append
|
||||
ARGS="$ARGS $1"
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
# set kokkos path
|
||||
if [ -z "$KOKKOS_PATH" ]; then
|
||||
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
|
||||
else
|
||||
# Ensure KOKKOS_PATH is abs path
|
||||
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
|
||||
fi
|
||||
|
||||
# set build type
|
||||
@ -381,11 +486,15 @@ single_build_and_test() {
|
||||
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
|
||||
fi
|
||||
|
||||
if [[ "$OPT_FLAG" = "" ]]; then
|
||||
OPT_FLAG="-O3"
|
||||
fi
|
||||
|
||||
if [[ "$build_type" = *debug* ]]; then
|
||||
local extra_args="$extra_args --debug"
|
||||
local cxxflags="-g $compiler_warning_flags"
|
||||
else
|
||||
local cxxflags="-O3 $compiler_warning_flags"
|
||||
local cxxflags="$OPT_FLAG $compiler_warning_flags"
|
||||
fi
|
||||
|
||||
if [[ "$compiler" == cuda* ]]; then
|
||||
@ -393,7 +502,9 @@ single_build_and_test() {
|
||||
export TMPDIR=$(pwd)
|
||||
fi
|
||||
|
||||
# cxxflags="-DKOKKOS_USING_EXP_VIEW=1 $cxxflags"
|
||||
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
|
||||
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
|
||||
fi
|
||||
|
||||
echo " Starting job $desc"
|
||||
|
||||
@ -440,13 +551,14 @@ run_in_background() {
|
||||
local compiler=$1
|
||||
|
||||
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
|
||||
if [[ "$BUILD_ONLY" == True ]]; then
|
||||
num_jobs=8
|
||||
else
|
||||
# don't override command line input
|
||||
# if [[ "$BUILD_ONLY" == True ]]; then
|
||||
# num_jobs=8
|
||||
# else
|
||||
if [[ "$compiler" == cuda* ]]; then
|
||||
num_jobs=1
|
||||
fi
|
||||
fi
|
||||
# fi
|
||||
wait_for_jobs $num_jobs
|
||||
|
||||
single_build_and_test $* &
|
||||
|
||||
50
lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh
Executable file
50
lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash -le
# Prepare two Trilinos clones under the current working directory:
#   trilinos-update   - develop branch, reset to origin/develop, into which
#                       snapshot.py is run with ${KOKKOS_PATH} as its source
#   trilinos-pristine - develop branch, then checked out at a commit SHA read
#                       from the updated clone's recent log
# Shebang flags: -l runs bash as a login shell, -e aborts on the first
# failing command.
# KOKKOS_PATH is read from the environment; it is not set in this script.
# NOTE(review): snapshot.py is invoked via the relative path
# kokkos/config/snapshot.py, so the starting directory presumably has to
# contain a Kokkos checkout named "kokkos" - confirm with the caller.
|
||||
|
||||
# Locations of the two clones, exported for any child processes.
export TRILINOS_UPDATED_PATH=${PWD}/trilinos-update
|
||||
export TRILINOS_PRISTINE_PATH=${PWD}/trilinos-pristine
|
||||
|
||||
# Optional clean-up of previous checkouts, left disabled.
#rm -rf ${KOKKOS_PATH}
|
||||
#rm -rf ${TRILINOS_UPDATED_PATH}
|
||||
#rm -rf ${TRILINOS_PRISTINE_PATH}
|
||||
|
||||
#Already done:
# Clone each repository only if its directory does not exist yet.
|
||||
if [ ! -d "${TRILINOS_UPDATED_PATH}" ]; then
|
||||
git clone https://github.com/trilinos/trilinos ${TRILINOS_UPDATED_PATH}
|
||||
fi
|
||||
if [ ! -d "${TRILINOS_PRISTINE_PATH}" ]; then
|
||||
git clone https://github.com/trilinos/trilinos ${TRILINOS_PRISTINE_PATH}
|
||||
fi
|
||||
|
||||
# Bring the "update" clone to the current origin/develop.
# The hard reset discards local commits and uncommitted changes there.
cd ${TRILINOS_UPDATED_PATH}
|
||||
git checkout develop
|
||||
|
||||
git reset --hard origin/develop
|
||||
git pull
|
||||
cd ..
|
||||
|
||||
# Run the Kokkos snapshot script against the updated clone's packages dir.
python kokkos/config/snapshot.py ${KOKKOS_PATH} ${TRILINOS_UPDATED_PATH}/packages
|
||||
|
||||
# Report the updated clone's commits from the last two days.
cd ${TRILINOS_UPDATED_PATH}
|
||||
echo ""
|
||||
echo ""
|
||||
echo "Trilinos State:"
|
||||
git log --pretty=oneline --since=2.days
|
||||
# SHA = hash on the second line of that log (or the only line if there is
# just one; empty if there are no commits in the last two days).
# NOTE(review): presumably the first line is the commit created by
# snapshot.py, making this the commit just before the snapshot - confirm.
SHA=`git log --pretty=oneline --since=2.days | head -n 2 | tail -n 1 | awk '{print $1}'`
|
||||
cd ..
|
||||
|
||||
# Update the "pristine" clone and check out the SHA captured above.
cd ${TRILINOS_PRISTINE_PATH}
|
||||
git status
|
||||
git log --pretty=oneline --since=2.days
|
||||
echo "Checkout develop"
|
||||
git checkout develop
|
||||
echo "Pull"
|
||||
git pull
|
||||
echo "Checkout SHA"
|
||||
git checkout ${SHA}
|
||||
cd ..
|
||||
|
||||
# Report the pristine clone's resulting state.
cd ${TRILINOS_PRISTINE_PATH}
|
||||
echo ""
|
||||
echo ""
|
||||
echo "Trilinos Pristine State:"
|
||||
git log --pretty=oneline --since=2.days
|
||||
cd ..
|
||||
@ -1,6 +1,6 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
|
||||
@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
else
|
||||
CXX ?= g++
|
||||
CXXFLAGS ?= -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
endif
|
||||
|
||||
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests
|
||||
|
||||
TEST_TARGETS =
|
||||
|
||||
@ -83,7 +83,7 @@ TEST_F( cuda, dynrankview_perf )
|
||||
{
|
||||
std::cout << "Cuda" << std::endl;
|
||||
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
|
||||
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
|
||||
test_dynrankview_op_perf<Kokkos::Cuda>( 40960 );
|
||||
}
|
||||
|
||||
TEST_F( cuda, global_2_local)
|
||||
|
||||
@ -180,8 +180,8 @@ void test_dynrankview_op_perf( const int par_size )
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
const size_type dim2 = 900;
|
||||
const size_type dim3 = 300;
|
||||
const size_type dim2 = 90;
|
||||
const size_type dim3 = 30;
|
||||
|
||||
double elapsed_time_view = 0;
|
||||
double elapsed_time_compview = 0;
|
||||
|
||||
@ -261,9 +261,6 @@ public:
|
||||
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
|
||||
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
|
||||
{
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
|
||||
#else
|
||||
if ( int(d_view.rank) != int(h_view.rank) ||
|
||||
d_view.dimension_0() != h_view.dimension_0() ||
|
||||
d_view.dimension_1() != h_view.dimension_1() ||
|
||||
@ -284,7 +281,6 @@ public:
|
||||
d_view.span() != h_view.span() ) {
|
||||
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//@}
|
||||
@ -315,13 +311,13 @@ public:
|
||||
template< class Device >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const typename Impl::if_c<
|
||||
Impl::is_same<typename t_dev::memory_space,
|
||||
std::is_same<typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
t_dev,
|
||||
t_host>::type& view () const
|
||||
{
|
||||
return Impl::if_c<
|
||||
Impl::is_same<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
t_dev,
|
||||
@ -347,13 +343,13 @@ public:
|
||||
/// appropriate template parameter.
|
||||
template<class Device>
|
||||
void sync( const typename Impl::enable_if<
|
||||
( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
|
||||
( Impl::is_same< Device , int>::value)
|
||||
( std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
|
||||
( std::is_same< Device , int>::value)
|
||||
, int >::type& = 0)
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
Impl::is_same<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value ,
|
||||
unsigned int,
|
||||
@ -370,7 +366,7 @@ public:
|
||||
modified_host() = modified_device() = 0;
|
||||
}
|
||||
}
|
||||
if(Impl::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
|
||||
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
|
||||
t_dev::execution_space::fence();
|
||||
t_host::execution_space::fence();
|
||||
}
|
||||
@ -378,13 +374,13 @@ public:
|
||||
|
||||
template<class Device>
|
||||
void sync ( const typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
|
||||
( Impl::is_same< Device , int>::value)
|
||||
( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
|
||||
( std::is_same< Device , int>::value)
|
||||
, int >::type& = 0 )
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
Impl::is_same<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
unsigned int,
|
||||
@ -405,7 +401,7 @@ public:
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
Impl::is_same<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value ,
|
||||
unsigned int,
|
||||
@ -431,7 +427,7 @@ public:
|
||||
void modify () {
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
Impl::is_same<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
unsigned int,
|
||||
@ -514,11 +510,7 @@ public:
|
||||
|
||||
//! The allocation size (same as Kokkos::View::capacity).
|
||||
size_t capacity() const {
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
return d_view.span();
|
||||
#else
|
||||
return d_view.capacity();
|
||||
#endif
|
||||
}
|
||||
|
||||
//! Get stride(s) for each dimension.
|
||||
@ -555,8 +547,6 @@ public:
|
||||
// Partial specializations of Kokkos::subview() for DualView objects.
|
||||
//
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
@ -590,352 +580,6 @@ subview( const DualView<D,A1,A2,A3> & src , Args ... args )
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#else
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// Partial specializations of Kokkos::subview() for DualView objects.
|
||||
//
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
|
||||
, class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
|
||||
, class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
|
||||
>
|
||||
struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type >
|
||||
, SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
|
||||
, SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
|
||||
{
|
||||
private:
|
||||
|
||||
typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > SrcViewType ;
|
||||
|
||||
enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
|
||||
enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
|
||||
enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
|
||||
enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
|
||||
enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
|
||||
enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
|
||||
enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
|
||||
enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
|
||||
|
||||
// The source view rank must be equal to the input argument rank
|
||||
// Once a void argument is encountered all subsequent arguments must be void.
|
||||
enum { InputRank =
|
||||
Impl::StaticAssert<( SrcViewType::rank ==
|
||||
( V0 ? 0 : (
|
||||
V1 ? 1 : (
|
||||
V2 ? 2 : (
|
||||
V3 ? 3 : (
|
||||
V4 ? 4 : (
|
||||
V5 ? 5 : (
|
||||
V6 ? 6 : (
|
||||
V7 ? 7 : 8 ))))))) ))
|
||||
&&
|
||||
( SrcViewType::rank ==
|
||||
( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
|
||||
>::value ? SrcViewType::rank : 0 };
|
||||
|
||||
enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
|
||||
enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
|
||||
enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
|
||||
enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
|
||||
enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
|
||||
enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
|
||||
enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
|
||||
enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
|
||||
|
||||
enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
|
||||
+ unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
|
||||
|
||||
// Reverse
|
||||
enum { R0_rev = 0 == InputRank ? 0u : (
|
||||
1 == InputRank ? unsigned(R0) : (
|
||||
2 == InputRank ? unsigned(R1) : (
|
||||
3 == InputRank ? unsigned(R2) : (
|
||||
4 == InputRank ? unsigned(R3) : (
|
||||
5 == InputRank ? unsigned(R4) : (
|
||||
6 == InputRank ? unsigned(R5) : (
|
||||
7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
|
||||
|
||||
typedef typename SrcViewType::array_layout SrcViewLayout ;
|
||||
|
||||
// Choose array layout, attempting to preserve original layout if at all possible.
|
||||
typedef typename Impl::if_c<
|
||||
( // Same Layout IF
|
||||
// OutputRank 0
|
||||
( OutputRank == 0 )
|
||||
||
|
||||
// OutputRank 1 or 2, InputLayout Left, Interval 0
|
||||
// because single stride one or second index has a stride.
|
||||
( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
|
||||
||
|
||||
// OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
|
||||
// because single stride one or second index has a stride.
|
||||
( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
|
||||
), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
|
||||
|
||||
// Choose data type as a purely dynamic rank array to accomodate a runtime range.
|
||||
typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
|
||||
typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
|
||||
typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
|
||||
typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
|
||||
typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
|
||||
typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
|
||||
typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
|
||||
typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
|
||||
typename SrcViewType::value_type ********
|
||||
>::type >::type >::type >::type >::type >::type >::type >::type OutputData ;
|
||||
|
||||
// Choose space.
|
||||
// If the source view's template arg1 or arg2 is a space then use it,
|
||||
// otherwise use the source view's execution space.
|
||||
|
||||
typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
|
||||
typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space
|
||||
>::type >::type OutputSpace ;
|
||||
|
||||
public:
|
||||
|
||||
// If keeping the layout then match non-data type arguments
|
||||
// else keep execution space and memory traits.
|
||||
typedef typename
|
||||
Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
|
||||
, Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type >
|
||||
, Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace
|
||||
, typename SrcViewType::memory_traits >
|
||||
>::type type ;
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , void , void , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , void , void , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0);
|
||||
sub_view.h_view = subview(src.h_view,arg0);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , void , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , void , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , void
|
||||
, void , void , void , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, void , void , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 ,
|
||||
const ArgType3 & arg3 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, void , void , void , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
|
||||
class ArgType4 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , void , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 ,
|
||||
const ArgType3 & arg3 ,
|
||||
const ArgType4 & arg4 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , void , void ,void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
|
||||
class ArgType4 , class ArgType5 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , void , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 ,
|
||||
const ArgType3 & arg3 ,
|
||||
const ArgType4 & arg4 ,
|
||||
const ArgType5 & arg5 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , void , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
|
||||
class ArgType4 , class ArgType5 , class ArgType6 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , ArgType6 , void
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 ,
|
||||
const ArgType3 & arg3 ,
|
||||
const ArgType4 & arg4 ,
|
||||
const ArgType5 & arg5 ,
|
||||
const ArgType6 & arg6 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , ArgType6 , void
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 ,
|
||||
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
|
||||
class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
|
||||
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , ArgType6 , ArgType7
|
||||
>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src ,
|
||||
const ArgType0 & arg0 ,
|
||||
const ArgType1 & arg1 ,
|
||||
const ArgType2 & arg2 ,
|
||||
const ArgType3 & arg3 ,
|
||||
const ArgType4 & arg4 ,
|
||||
const ArgType5 & arg5 ,
|
||||
const ArgType6 & arg6 ,
|
||||
const ArgType7 & arg7 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::ViewSubview< DualView<D,A1,A2,A3>
|
||||
, ArgType0 , ArgType1 , ArgType2 , ArgType3
|
||||
, ArgType4 , ArgType5 , ArgType6 , ArgType7
|
||||
>::type
|
||||
DstViewType ;
|
||||
DstViewType sub_view;
|
||||
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
|
||||
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
|
||||
sub_view.modified_device = src.modified_device;
|
||||
sub_view.modified_host = src.modified_host;
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -223,14 +223,85 @@ struct DynRankDimTraits {
|
||||
);
|
||||
}
|
||||
|
||||
template < typename DynRankViewType , typename iType >
|
||||
void verify_dynrankview_rank ( iType N , const DynRankViewType &drv )
|
||||
{
|
||||
if ( static_cast<iType>(drv.rank()) > N )
|
||||
{
|
||||
Kokkos::abort( "Need at least rank arguments to the operator()" );
|
||||
}
|
||||
|
||||
/** \brief Debug bounds-checking routines */
|
||||
// Enhanced debug checking - most infrastructure matches that of functions in
|
||||
// Kokkos_ViewMapping; additional checks for extra arguments beyond rank are 0
|
||||
template< unsigned , typename iType0 , class MapType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool dyn_rank_view_verify_operator_bounds( const iType0 & , const MapType & )
|
||||
{ return true ; }
|
||||
|
||||
template< unsigned R , typename iType0 , class MapType , typename iType1 , class ... Args >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool dyn_rank_view_verify_operator_bounds
|
||||
( const iType0 & rank
|
||||
, const MapType & map
|
||||
, const iType1 & i
|
||||
, Args ... args
|
||||
)
|
||||
{
|
||||
if ( static_cast<iType0>(R) < rank ) {
|
||||
return ( size_t(i) < map.extent(R) )
|
||||
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
|
||||
}
|
||||
else if ( i != 0 ) {
|
||||
printf("DynRankView Debug Bounds Checking Error: at rank %u\n Extra arguments beyond the rank must be zero \n",R);
|
||||
return ( false )
|
||||
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
|
||||
}
|
||||
else {
|
||||
return ( true )
|
||||
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
|
||||
}
|
||||
}
|
||||
|
||||
// Recursion terminator: every index has been formatted, nothing to append.
template< unsigned , class MapType >
inline
void dyn_rank_view_error_operator_bounds( char * , int , const MapType & )
{}

// Appends " <index> < <extent> ," (or ")" after the last index) for index
// `i` at position R to the message being built in `buf`, then recurses on
// the remaining indices with position R+1.
//
//   buf  - where to write next; receives at most `len` bytes including
//          the terminating NUL
//   len  - bytes still available at `buf`
//   map  - object providing extent(R)
//
// If the remaining space is exhausted the message is left truncated (and
// NUL terminated) rather than written past the end of the buffer.
template< unsigned R , class MapType , class iType , class ... Args >
inline
void dyn_rank_view_error_operator_bounds
  ( char * buf
  , int len
  , const MapType & map
  , const iType & i
  , Args ... args
  )
{
  // No room left, not even for a terminator: stop instead of handing a
  // non-positive size to snprintf (it would be converted to a huge size_t).
  if ( len <= 0 ) { return ; }

  // The conversion specifiers must match the argument types: the index is
  // printed as a signed long (so a negative index stays readable) and the
  // extent as an unsigned long.
  const int n =
    snprintf(buf,static_cast<size_t>(len)," %ld < %lu %c"
            , static_cast<long>(i)
            , static_cast<unsigned long>( map.extent(R) )
            , ( sizeof...(Args) ? ',' : ')' )
            );

  // snprintf returns the length the full output would have had; a value
  // >= len means the output was truncated and buf+n would lie beyond the
  // buffer.  A negative value signals an output error.
  if ( n < 0 || n >= len ) { return ; }

  dyn_rank_view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
}
|
||||
|
||||
// op_rank = rank of the operator version that was called
|
||||
template< typename iType0 , typename iType1 , class MapType , class ... Args >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void dyn_rank_view_verify_operator_bounds
|
||||
( const iType0 & op_rank , const iType1 & rank , const char* label , const MapType & map , Args ... args )
|
||||
{
|
||||
if ( static_cast<iType0>(rank) > op_rank ) {
|
||||
Kokkos::abort( "DynRankView Bounds Checking Error: Need at least rank arguments to the operator()" );
|
||||
}
|
||||
|
||||
if ( ! dyn_rank_view_verify_operator_bounds<0>( rank , map , args ... ) ) {
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
enum { LEN = 1024 };
|
||||
char buffer[ LEN ];
|
||||
int n = snprintf(buffer,LEN,"DynRankView bounds error of view %s (", label);
|
||||
dyn_rank_view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
|
||||
Kokkos::Impl::throw_runtime_exception(std::string(buffer));
|
||||
#else
|
||||
Kokkos::abort("DynRankView bounds error");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** \brief Assign compatible default mappings */
|
||||
@ -341,7 +412,6 @@ class DynRankView : public ViewTraits< DataType , Properties ... >
|
||||
|
||||
private:
|
||||
template < class , class ... > friend class DynRankView ;
|
||||
// template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now...
|
||||
template < class , class ... > friend class Impl::ViewMapping ;
|
||||
|
||||
public:
|
||||
@ -504,20 +574,26 @@ private:
|
||||
( is_layout_left || is_layout_right || is_layout_stride )
|
||||
};
|
||||
|
||||
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
|
||||
{ KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
|
||||
|
||||
template< class Space > struct verify_space<Space,false>
|
||||
{ KOKKOS_FORCEINLINE_FUNCTION static void check()
|
||||
{ Kokkos::abort("Kokkos::DynRankView ERROR: attempt to access inaccessible memory space"); };
|
||||
};
|
||||
|
||||
// Bounds checking macros
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
|
||||
#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
|
||||
Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \
|
||||
Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ;
|
||||
// rank of the calling operator - included as first argument in ARG
|
||||
#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
|
||||
DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
|
||||
Kokkos::Experimental::Impl::dyn_rank_view_verify_operator_bounds ARG ;
|
||||
|
||||
#else
|
||||
|
||||
#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
|
||||
#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
|
||||
DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
#endif
|
||||
|
||||
@ -532,7 +608,11 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
reference_type operator()() const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( implementation_map() ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , NULL , m_map) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map) )
|
||||
#endif
|
||||
return implementation_map().reference();
|
||||
//return m_map.reference(0,0,0,0,0,0,0);
|
||||
}
|
||||
@ -563,12 +643,17 @@ public:
|
||||
return rankone_view(i0);
|
||||
}
|
||||
|
||||
// Rank 1 parenthesis
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
|
||||
operator()(const iType & i0 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
return m_map.reference(i0);
|
||||
}
|
||||
|
||||
@ -577,6 +662,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
|
||||
operator()(const iType & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
return m_map.reference(i0,0,0,0,0,0,0);
|
||||
}
|
||||
|
||||
@ -586,7 +676,11 @@ public:
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1);
|
||||
}
|
||||
|
||||
@ -595,7 +689,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,0,0,0,0,0);
|
||||
}
|
||||
|
||||
@ -605,7 +703,11 @@ public:
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2);
|
||||
}
|
||||
|
||||
@ -614,7 +716,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,0,0,0,0);
|
||||
}
|
||||
|
||||
@ -624,7 +730,11 @@ public:
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3);
|
||||
}
|
||||
|
||||
@ -633,7 +743,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,0,0,0);
|
||||
}
|
||||
|
||||
@ -643,7 +757,11 @@ public:
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,i4);
|
||||
}
|
||||
|
||||
@ -652,7 +770,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,i4,0,0);
|
||||
}
|
||||
|
||||
@ -662,7 +784,11 @@ public:
|
||||
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5);
|
||||
}
|
||||
|
||||
@ -671,7 +797,11 @@ public:
|
||||
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,0);
|
||||
}
|
||||
|
||||
@ -681,7 +811,11 @@ public:
|
||||
typename std::enable_if< (std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type
|
||||
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
|
||||
{
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) )
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5 , i6) )
|
||||
#else
|
||||
KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6) )
|
||||
#endif
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
|
||||
}
|
||||
|
||||
@ -1136,13 +1270,13 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::ViewTraits
|
||||
typedef Kokkos::ViewTraits
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
, typename SrcTraits::memory_traits > traits_type ;
|
||||
|
||||
typedef Kokkos::Experimental::View
|
||||
typedef Kokkos::View
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
@ -1154,13 +1288,13 @@ public:
|
||||
|
||||
static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
|
||||
|
||||
typedef Kokkos::Experimental::ViewTraits
|
||||
typedef Kokkos::ViewTraits
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
, MemoryTraits > traits_type ;
|
||||
|
||||
typedef Kokkos::Experimental::View
|
||||
typedef Kokkos::View
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
@ -1264,7 +1398,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
|
||||
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
|
||||
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
|
||||
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::Experimental::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
|
||||
return metafcn::subview( src.rank() , src , args... );
|
||||
}
|
||||
@ -1502,10 +1636,10 @@ void deep_copy
|
||||
typedef typename src_type::memory_space src_memory_space ;
|
||||
|
||||
enum { DstExecCanAccessSrc =
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
|
||||
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
|
||||
|
||||
enum { SrcExecCanAccessDst =
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
|
||||
Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
|
||||
|
||||
if ( (void *) dst.data() != (void*) src.data() ) {
|
||||
|
||||
@ -1666,7 +1800,7 @@ inline
|
||||
typename DynRankView<T,P...>::HostMirror
|
||||
create_mirror( const DynRankView<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
|
||||
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
)
|
||||
@ -1684,7 +1818,7 @@ inline
|
||||
typename DynRankView<T,P...>::HostMirror
|
||||
create_mirror( const DynRankView<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
|
||||
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
)
|
||||
@ -1779,7 +1913,7 @@ void resize( DynRankView<T,P...> & v ,
|
||||
{
|
||||
typedef DynRankView<T,P...> drview_type ;
|
||||
|
||||
static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
|
||||
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
|
||||
|
||||
drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 );
|
||||
|
||||
@ -1803,7 +1937,7 @@ void realloc( DynRankView<T,P...> & v ,
|
||||
{
|
||||
typedef DynRankView<T,P...> drview_type ;
|
||||
|
||||
static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
|
||||
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
|
||||
|
||||
const std::string label = v.label();
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ namespace Experimental {
|
||||
* Subviews are not allowed.
|
||||
*/
|
||||
template< typename DataType , typename ... P >
|
||||
class DynamicView : public Kokkos::Experimental::ViewTraits< DataType , P ... >
|
||||
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
|
||||
{
|
||||
public:
|
||||
|
||||
@ -75,6 +75,15 @@ private:
|
||||
std::is_same< typename traits::specialize , void >::value
|
||||
, "DynamicView must have trivial data type" );
|
||||
|
||||
|
||||
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
|
||||
{ KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
|
||||
|
||||
template< class Space > struct verify_space<Space,false>
|
||||
{ KOKKOS_FORCEINLINE_FUNCTION static void check()
|
||||
{ Kokkos::abort("Kokkos::DynamicView ERROR: attempt to access inaccessible memory space"); };
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
|
||||
@ -117,10 +126,10 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t size() const
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
|
||||
Kokkos::Impl::MemorySpaceAccess
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space
|
||||
>::value
|
||||
>::accessible
|
||||
? // Runtime size is at the end of the chunk pointer array
|
||||
(*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
|
||||
<< m_chunk_shift
|
||||
@ -179,10 +188,7 @@ public:
|
||||
static_assert( Kokkos::Impl::are_integral<I0,Args...>::value
|
||||
, "Indices must be integral type" );
|
||||
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space
|
||||
>::verify();
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
// Which chunk is being indexed.
|
||||
const uintptr_t ic = uintptr_t( i0 >> m_chunk_shift );
|
||||
@ -223,15 +229,13 @@ public:
|
||||
{
|
||||
typedef typename traits::value_type value_type ;
|
||||
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space >::verify();
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
|
||||
|
||||
if ( m_chunk_max < NC ) {
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
printf("DynamicView::resize_parallel(%lu) m_chunk_max(%lu) NC(%lu)\n"
|
||||
printf("DynamicView::resize_parallel(%lu) m_chunk_max(%u) NC(%lu)\n"
|
||||
, n , m_chunk_max , NC );
|
||||
#endif
|
||||
Kokkos::abort("DynamicView::resize_parallel exceeded maximum size");
|
||||
@ -269,9 +273,7 @@ public:
|
||||
inline
|
||||
void resize_serial( size_t n )
|
||||
{
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space >::verify();
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
|
||||
|
||||
@ -398,9 +400,7 @@ public:
|
||||
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
|
||||
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
|
||||
{
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space >::verify();
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
// A functor to deallocate all of the chunks upon final destruction
|
||||
|
||||
@ -452,7 +452,7 @@ void deep_copy( const View<T,DP...> & dst
|
||||
typedef typename ViewTraits<T,SP...>::memory_space src_memory_space ;
|
||||
|
||||
enum { DstExecCanAccessSrc =
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
|
||||
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
|
||||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
@ -476,7 +476,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
|
||||
typedef typename ViewTraits<T,SP...>::memory_space src_memory_space ;
|
||||
|
||||
enum { DstExecCanAccessSrc =
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
|
||||
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
|
||||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
|
||||
196
lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
Normal file
196
lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
Normal file
@ -0,0 +1,196 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
|
||||
#define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
#include <Kokkos_DualView.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
class ErrorReporter
|
||||
{
|
||||
public:
|
||||
|
||||
typedef ReportType report_type;
|
||||
typedef DeviceType device_type;
|
||||
typedef typename device_type::execution_space execution_space;
|
||||
|
||||
ErrorReporter(int max_results)
|
||||
: m_numReportsAttempted(""),
|
||||
m_reports("", max_results),
|
||||
m_reporters("", max_results)
|
||||
{
|
||||
clear();
|
||||
}
|
||||
|
||||
int getCapacity() const { return m_reports.h_view.dimension_0(); }
|
||||
|
||||
int getNumReports();
|
||||
|
||||
int getNumReportAttempts();
|
||||
|
||||
void getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out);
|
||||
void getReports( typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
|
||||
typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out);
|
||||
|
||||
void clear();
|
||||
|
||||
void resize(const size_t new_size);
|
||||
|
||||
bool full() {return (getNumReportAttempts() >= getCapacity()); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool add_report(int reporter_id, report_type report) const
|
||||
{
|
||||
int idx = Kokkos::atomic_fetch_add(&m_numReportsAttempted(), 1);
|
||||
|
||||
if (idx >= 0 && (idx < static_cast<int>(m_reports.d_view.dimension_0()))) {
|
||||
m_reporters.d_view(idx) = reporter_id;
|
||||
m_reports.d_view(idx) = report;
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
typedef Kokkos::View<report_type *, execution_space> reports_view_t;
|
||||
typedef Kokkos::DualView<report_type *, execution_space> reports_dualview_t;
|
||||
|
||||
typedef typename reports_dualview_t::host_mirror_space host_mirror_space;
|
||||
Kokkos::View<int, execution_space> m_numReportsAttempted;
|
||||
reports_dualview_t m_reports;
|
||||
Kokkos::DualView<int *, execution_space> m_reporters;
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
inline int ErrorReporter<ReportType, DeviceType>::getNumReports()
|
||||
{
|
||||
int num_reports = 0;
|
||||
Kokkos::deep_copy(num_reports,m_numReportsAttempted);
|
||||
if (num_reports > static_cast<int>(m_reports.h_view.dimension_0())) {
|
||||
num_reports = m_reports.h_view.dimension_0();
|
||||
}
|
||||
return num_reports;
|
||||
}
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
inline int ErrorReporter<ReportType, DeviceType>::getNumReportAttempts()
|
||||
{
|
||||
int num_reports = 0;
|
||||
Kokkos::deep_copy(num_reports,m_numReportsAttempted);
|
||||
return num_reports;
|
||||
}
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
void ErrorReporter<ReportType, DeviceType>::getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out)
|
||||
{
|
||||
int num_reports = getNumReports();
|
||||
reporters_out.clear();
|
||||
reporters_out.reserve(num_reports);
|
||||
reports_out.clear();
|
||||
reports_out.reserve(num_reports);
|
||||
|
||||
if (num_reports > 0) {
|
||||
m_reports.template sync<host_mirror_space>();
|
||||
m_reporters.template sync<host_mirror_space>();
|
||||
|
||||
for (int i = 0; i < num_reports; ++i) {
|
||||
reporters_out.push_back(m_reporters.h_view(i));
|
||||
reports_out.push_back(m_reports.h_view(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
void ErrorReporter<ReportType, DeviceType>::getReports(
|
||||
typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
|
||||
typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out)
|
||||
{
|
||||
int num_reports = getNumReports();
|
||||
reporters_out = typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reporters_out",num_reports);
|
||||
reports_out = typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reports_out",num_reports);
|
||||
|
||||
if (num_reports > 0) {
|
||||
m_reports.template sync<host_mirror_space>();
|
||||
m_reporters.template sync<host_mirror_space>();
|
||||
|
||||
for (int i = 0; i < num_reports; ++i) {
|
||||
reporters_out(i) = m_reporters.h_view(i);
|
||||
reports_out(i) = m_reports.h_view(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
void ErrorReporter<ReportType, DeviceType>::clear()
|
||||
{
|
||||
int num_reports=0;
|
||||
Kokkos::deep_copy(m_numReportsAttempted, num_reports);
|
||||
m_reports.template modify<execution_space>();
|
||||
m_reporters.template modify<execution_space>();
|
||||
}
|
||||
|
||||
template <typename ReportType, typename DeviceType>
|
||||
void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size)
|
||||
{
|
||||
m_reports.resize(new_size);
|
||||
m_reporters.resize(new_size);
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
@ -1,531 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_SEGMENTED_VIEW_HPP_
|
||||
#define KOKKOS_SEGMENTED_VIEW_HPP_
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <cstdio>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
// Forward declaration only; the definition is not part of this chunk.
// NOTE(review): presumably the functor that releases a SegmentedView's
// segments -- confirm against the definition.
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
|
||||
struct delete_segmented_view;
|
||||
|
||||
// Generic fallback: memory spaces without an explicit specialization need
// no adjustment, so the requested size is ignored.
template< typename MemorySpace >
inline void DeviceSetAllocatableMemorySize( size_t /* size */ )
{
}
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
template<>
|
||||
inline
|
||||
void DeviceSetAllocatableMemorySize<Kokkos::CudaSpace>(size_t size) {
|
||||
#ifdef __CUDACC__
|
||||
size_t size_limit;
|
||||
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
|
||||
if(size_limit<size)
|
||||
cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
|
||||
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
void DeviceSetAllocatableMemorySize<Kokkos::CudaUVMSpace>(size_t size) {
|
||||
#ifdef __CUDACC__
|
||||
size_t size_limit;
|
||||
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
|
||||
if(size_limit<size)
|
||||
cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
|
||||
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
|
||||
|
||||
}
|
||||
|
||||
template< class DataType ,
|
||||
class Arg1Type = void ,
|
||||
class Arg2Type = void ,
|
||||
class Arg3Type = void>
|
||||
class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
|
||||
{
|
||||
public:
|
||||
//! \name Typedefs for device types and various Kokkos::View specializations.
|
||||
//@{
|
||||
typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
|
||||
|
||||
//! The type of a Kokkos::View on the device.
|
||||
typedef Kokkos::View< typename traits::data_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::memory_space ,
|
||||
Kokkos::MemoryUnmanaged > t_dev ;
|
||||
|
||||
|
||||
private:
|
||||
Kokkos::View<t_dev*,typename traits::memory_space> segments_;
|
||||
|
||||
Kokkos::View<int,typename traits::memory_space> realloc_lock;
|
||||
Kokkos::View<int,typename traits::memory_space> nsegments_;
|
||||
|
||||
size_t segment_length_;
|
||||
size_t segment_length_m1_;
|
||||
int max_segments_;
|
||||
|
||||
int segment_length_log2;
|
||||
|
||||
// Dimensions, cardinality, capacity, and offset computation for
|
||||
// multidimensional array view of contiguous memory.
|
||||
// Inherits from Impl::Shape
|
||||
typedef Kokkos::Impl::ViewOffset< typename traits::shape_type
|
||||
, typename traits::array_layout
|
||||
> offset_map_type ;
|
||||
|
||||
offset_map_type m_offset_map ;
|
||||
|
||||
typedef Kokkos::View< typename traits::array_intrinsic_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::memory_space ,
|
||||
typename traits::memory_traits > array_type ;
|
||||
|
||||
typedef Kokkos::View< typename traits::const_data_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::memory_space ,
|
||||
typename traits::memory_traits > const_type ;
|
||||
|
||||
typedef Kokkos::View< typename traits::non_const_data_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::memory_space ,
|
||||
typename traits::memory_traits > non_const_type ;
|
||||
|
||||
typedef Kokkos::View< typename traits::non_const_data_type ,
|
||||
typename traits::array_layout ,
|
||||
HostSpace ,
|
||||
void > HostMirror ;
|
||||
|
||||
template< bool Accessible >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type
|
||||
dimension_0_intern() const { return nsegments_() * segment_length_ ; }
|
||||
|
||||
template< bool Accessible >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type
|
||||
dimension_0_intern() const
|
||||
{
|
||||
// In Host space
|
||||
int n = 0 ;
|
||||
#if ! defined( __CUDA_ARCH__ )
|
||||
Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) );
|
||||
#endif
|
||||
|
||||
return n * segment_length_ ;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
enum { Rank = traits::rank };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
|
||||
|
||||
/* \brief return (current) size of dimension 0 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const {
|
||||
enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
|
||||
int n = SegmentedView::dimension_0_intern< Accessible >();
|
||||
return n ;
|
||||
}
|
||||
|
||||
/* \brief return size of dimension 1 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
|
||||
/* \brief return size of dimension 2 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
|
||||
/* \brief return size of dimension 3 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
|
||||
/* \brief return size of dimension 4 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
|
||||
/* \brief return size of dimension 5 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
|
||||
/* \brief return size of dimension 6 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
|
||||
/* \brief return size of dimension 7 */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
|
||||
|
||||
/* \brief return the product of the extents of all dimensions */
|
||||
KOKKOS_INLINE_FUNCTION typename traits::size_type size() const {
|
||||
return dimension_0() *
|
||||
m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
|
||||
m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ;
|
||||
}
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename traits::size_type dimension( const iType & i ) const {
|
||||
if(i==0)
|
||||
return dimension_0();
|
||||
else
|
||||
return Kokkos::Impl::dimension( m_offset_map , i );
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename traits::size_type capacity() {
|
||||
return segments_.dimension_0() *
|
||||
m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
|
||||
m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename traits::size_type get_num_segments() {
|
||||
enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
|
||||
int n = SegmentedView::dimension_0_intern< Accessible >();
|
||||
return n/segment_length_ ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename traits::size_type get_max_segments() {
|
||||
return max_segments_;
|
||||
}
|
||||
|
||||
/// \brief Constructor that allocates View objects with an initial length of 0.
|
||||
///
|
||||
/// This constructor works mostly like the analogous constructor of View.
|
||||
/// The first argument is a string label, which is entirely for your
|
||||
/// benefit. (Different SegmentedView objects may have the same label if
|
||||
/// you like.) The second argument 'view_length' is the size of the segments.
|
||||
/// This number must be a power of two. The third argument n0 is the maximum
|
||||
/// value for the first dimension of the segmented view. The maximal allocatable
|
||||
/// number of Segments is thus: (n0+view_length-1)/view_length.
|
||||
/// The arguments that follow are the remaining dimensions (1-7) of the
|
||||
/// View objects. For example, for a View with 3 runtime dimensions,
|
||||
/// the first 4 integer arguments will be nonzero:
|
||||
/// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView
|
||||
/// with a maximum of 306 segments of dimension (32768,8,4). The logical size of
|
||||
/// the segmented view is (n,8,4) with n between 0 and 10000000.
|
||||
/// You may omit the integer arguments that follow.
|
||||
template< class LabelType >
|
||||
SegmentedView(const LabelType & label ,
|
||||
const size_t view_length ,
|
||||
const size_t n0 ,
|
||||
const size_t n1 = 0 ,
|
||||
const size_t n2 = 0 ,
|
||||
const size_t n3 = 0 ,
|
||||
const size_t n4 = 0 ,
|
||||
const size_t n5 = 0 ,
|
||||
const size_t n6 = 0 ,
|
||||
const size_t n7 = 0
|
||||
): segment_length_(view_length),segment_length_m1_(view_length-1)
|
||||
{
|
||||
segment_length_log2 = -1;
|
||||
size_t l = segment_length_;
|
||||
while(l>0) {
|
||||
l>>=1;
|
||||
segment_length_log2++;
|
||||
}
|
||||
l = 1<<segment_length_log2;
|
||||
if(l!=segment_length_)
|
||||
Kokkos::Impl::throw_runtime_exception("Kokkos::SegmentedView requires a 'power of 2' segment length");
|
||||
|
||||
max_segments_ = (n0+segment_length_m1_)/segment_length_;
|
||||
|
||||
Impl::DeviceSetAllocatableMemorySize<typename traits::memory_space>(segment_length_*max_segments_*sizeof(typename traits::value_type));
|
||||
|
||||
segments_ = Kokkos::View<t_dev*,typename traits::execution_space>(label , max_segments_);
|
||||
realloc_lock = Kokkos::View<int,typename traits::execution_space>("Lock");
|
||||
nsegments_ = Kokkos::View<int,typename traits::execution_space>("nviews");
|
||||
m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 );
|
||||
|
||||
}
|
||||
|
||||
/// \brief Copy constructor: initializes every member from the matching
///        member of \c src.
KOKKOS_INLINE_FUNCTION
SegmentedView(const SegmentedView& src)
  : segments_           ( src.segments_ )
  , realloc_lock        ( src.realloc_lock )
  , nsegments_          ( src.nsegments_ )
  , segment_length_     ( src.segment_length_ )
  , segment_length_m1_  ( src.segment_length_m1_ )
  , max_segments_       ( src.max_segments_ )
  , segment_length_log2 ( src.segment_length_log2 )
  , m_offset_map        ( src.m_offset_map )
{
}
|
||||
|
||||
/// \brief Copy assignment: assigns every member from the matching member
///        of \c src and returns a reference to this object.
KOKKOS_INLINE_FUNCTION
SegmentedView& operator= (const SegmentedView& src) {
  // Views holding the segment table, the lock flag and the segment counter.
  segments_    = src.segments_;
  realloc_lock = src.realloc_lock;
  nsegments_   = src.nsegments_;

  // Segment geometry.
  segment_length_     = src.segment_length_;
  segment_length_m1_  = src.segment_length_m1_;
  max_segments_       = src.max_segments_;
  segment_length_log2 = src.segment_length_log2;

  // Dimension / offset bookkeeping.
  m_offset_map = src.m_offset_map;

  return *this;
}
|
||||
|
||||
/// \brief Destructor.
///
/// Does nothing unless reference counting is active and the tracker of
/// segments_ reports a count of exactly one.  In that case it fences, copies
/// the segment counter to the host and launches delete_segmented_view over
/// that many indices.
~SegmentedView() {
  if ( !segments_.tracker().ref_counting() ) return;

  const size_t remaining_refs = segments_.tracker().ref_count();
  if ( remaining_refs != 1u ) return;

  Kokkos::fence();

  // Host-side copy of the segment counter, used as the loop bound below.
  typedef typename Kokkos::View<int,typename traits::execution_space>::HostMirror host_counter_type;
  host_counter_type h_nviews("h_nviews");
  Kokkos::deep_copy(h_nviews,nsegments_);

  Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view<DataType , Arg1Type , Arg2Type, Arg3Type>(*this));
}
|
||||
|
||||
/// \brief Returns (by value) the entry of the segment table at index \c i.
///        No bounds check is performed here.
KOKKOS_INLINE_FUNCTION
t_dev get_segment(const int& i) const {
  return segments_[ i ];
}
|
||||
|
||||
template< class MemberType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void grow (MemberType& team_member, const size_t& growSize) const {
|
||||
if (growSize>max_segments_*segment_length_) {
|
||||
printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
|
||||
return;
|
||||
}
|
||||
|
||||
if(team_member.team_rank()==0) {
|
||||
bool too_small = growSize > segment_length_ * nsegments_();
|
||||
if (too_small) {
|
||||
while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) )
|
||||
; // get the lock
|
||||
too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock
|
||||
if(too_small) {
|
||||
while(too_small) {
|
||||
const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
|
||||
m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
|
||||
typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size];
|
||||
|
||||
segments_(nsegments_()) =
|
||||
t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7);
|
||||
nsegments_()++;
|
||||
too_small = growSize > segment_length_ * nsegments_();
|
||||
}
|
||||
}
|
||||
realloc_lock() = 0; //release the lock
|
||||
}
|
||||
}
|
||||
team_member.team_barrier();
|
||||
}
|
||||
|
||||
/// \brief Grow the view without taking the reallocation lock.
///
/// Appends segments until \c growSize rows are covered.  A request larger
/// than max_segments_*segment_length_ prints a message and returns without
/// allocating.  As the name says, no synchronization is done here.
KOKKOS_INLINE_FUNCTION
void grow_non_thread_safe (const size_t& growSize) const {
  if (growSize>max_segments_*segment_length_) {
    printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
    return;
  }

  // Number of elements in one segment; constant for the whole loop.
  const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
                            m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;

  // Add one segment per pass until the request is covered.
  while ( growSize > segment_length_ * nsegments_() ) {
    typename traits::non_const_value_type* const ptr =
      new typename traits::non_const_value_type[alloc_size];

    segments_(nsegments_()) =
      t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2,
             m_offset_map.N3, m_offset_map.N4, m_offset_map.N5,
             m_offset_map.N6, m_offset_map.N7);
    nsegments_()++;
  }
}
|
||||
|
||||
template< typename iType0 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value && traits::rank == 1 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_));
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
traits::rank == 2 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
traits::rank == 3 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
std::is_integral<iType3>::value &&
|
||||
traits::rank == 4 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
|
||||
typename iType4 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
std::is_integral<iType3>::value &&
|
||||
std::is_integral<iType4>::value &&
|
||||
traits::rank == 5 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
|
||||
const iType4 & i4 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
|
||||
typename iType4 , typename iType5 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
std::is_integral<iType3>::value &&
|
||||
std::is_integral<iType4>::value &&
|
||||
std::is_integral<iType5>::value &&
|
||||
traits::rank == 6 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
|
||||
const iType4 & i4 , const iType5 & i5 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
|
||||
typename iType4 , typename iType5 , typename iType6 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
std::is_integral<iType3>::value &&
|
||||
std::is_integral<iType4>::value &&
|
||||
std::is_integral<iType5>::value &&
|
||||
std::is_integral<iType6>::value &&
|
||||
traits::rank == 7 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
|
||||
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6);
|
||||
}
|
||||
|
||||
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
|
||||
typename iType4 , typename iType5 , typename iType6 , typename iType7 >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<iType0>::value &&
|
||||
std::is_integral<iType1>::value &&
|
||||
std::is_integral<iType2>::value &&
|
||||
std::is_integral<iType3>::value &&
|
||||
std::is_integral<iType4>::value &&
|
||||
std::is_integral<iType5>::value &&
|
||||
std::is_integral<iType6>::value &&
|
||||
std::is_integral<iType7>::value &&
|
||||
traits::rank == 8 )
|
||||
, typename traits::value_type &
|
||||
>::type
|
||||
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
|
||||
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
|
||||
{
|
||||
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7);
|
||||
}
|
||||
};
|
||||
|
||||
namespace Impl {
|
||||
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
|
||||
struct delete_segmented_view {
|
||||
typedef SegmentedView<DataType , Arg1Type , Arg2Type, Arg3Type> view_type;
|
||||
typedef typename view_type::execution_space execution_space;
|
||||
|
||||
view_type view_;
|
||||
delete_segmented_view(view_type view):view_(view) {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (int i) const {
|
||||
delete [] view_.get_segment(i).ptr_on_device();
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -241,9 +241,9 @@ public:
|
||||
typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type;
|
||||
typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type;
|
||||
|
||||
static const bool is_set = Impl::is_same<void,value_type>::value;
|
||||
static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value;
|
||||
static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value;
|
||||
static const bool is_set = std::is_same<void,value_type>::value;
|
||||
static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
|
||||
static const bool has_const_value = is_set || std::is_same<const_value_type,declared_value_type>::value;
|
||||
|
||||
static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
|
||||
static const bool is_modifiable_map = has_const_key && !has_const_value;
|
||||
@ -735,8 +735,8 @@ public:
|
||||
}
|
||||
|
||||
template <typename SKey, typename SValue, typename SDevice>
|
||||
typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
|
||||
Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
|
||||
typename Impl::enable_if< std::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
|
||||
std::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
|
||||
>::type
|
||||
create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
|
||||
{
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
|
||||
@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
else
|
||||
CXX ?= g++
|
||||
CXXFLAGS ?= -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
endif
|
||||
|
||||
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
|
||||
|
||||
TEST_TARGETS =
|
||||
|
||||
@ -59,11 +59,13 @@
|
||||
#include <TestVector.hpp>
|
||||
#include <TestDualView.hpp>
|
||||
#include <TestDynamicView.hpp>
|
||||
#include <TestSegmentedView.hpp>
|
||||
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
#include <TestDynViewAPI.hpp>
|
||||
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -133,11 +135,6 @@ void cuda_test_dualview_combinations(unsigned int size)
|
||||
test_dualview_combinations<int,Kokkos::Cuda>(size);
|
||||
}
|
||||
|
||||
void cuda_test_segmented_view(unsigned int size)
|
||||
{
|
||||
test_segmented_view<double,Kokkos::Cuda>(size);
|
||||
}
|
||||
|
||||
void cuda_test_bitset()
|
||||
{
|
||||
test_bitset<Kokkos::Cuda>();
|
||||
@ -184,11 +181,6 @@ void cuda_test_bitset()
|
||||
cuda_test_dualview_combinations(size); \
|
||||
}
|
||||
|
||||
#define CUDA_SEGMENTEDVIEW_TEST( size ) \
|
||||
TEST_F( cuda, segmentedview_##size##x) { \
|
||||
cuda_test_segmented_view(size); \
|
||||
}
|
||||
|
||||
CUDA_DUALVIEW_COMBINE_TEST( 10 )
|
||||
CUDA_VECTOR_COMBINE_TEST( 10 )
|
||||
CUDA_VECTOR_COMBINE_TEST( 3057 )
|
||||
@ -198,7 +190,6 @@ CUDA_INSERT_TEST(close, 100000, 90000, 100, 500)
|
||||
CUDA_INSERT_TEST(far, 100000, 90000, 100, 500)
|
||||
CUDA_DEEP_COPY( 10000, 1 )
|
||||
CUDA_FAILED_INSERT_TEST( 10000, 1000 )
|
||||
CUDA_SEGMENTEDVIEW_TEST( 200 )
|
||||
|
||||
|
||||
#undef CUDA_INSERT_TEST
|
||||
@ -207,7 +198,6 @@ CUDA_SEGMENTEDVIEW_TEST( 200 )
|
||||
#undef CUDA_DEEP_COPY
|
||||
#undef CUDA_VECTOR_COMBINE_TEST
|
||||
#undef CUDA_DUALVIEW_COMBINE_TEST
|
||||
#undef CUDA_SEGMENTEDVIEW_TEST
|
||||
|
||||
|
||||
TEST_F( cuda , dynamic_view )
|
||||
@ -221,6 +211,18 @@ TEST_F( cuda , dynamic_view )
|
||||
}
|
||||
|
||||
|
||||
#if defined(KOKKOS_CLASS_LAMBDA)
|
||||
TEST_F(cuda, ErrorReporterViaLambda)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Cuda>>();
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_F(cuda, ErrorReporter)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriver<Kokkos::Cuda>>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif /* #ifdef KOKKOS_HAVE_CUDA */
|
||||
|
||||
@ -715,9 +715,9 @@ public:
|
||||
typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
|
||||
typedef typename dView0::host_mirror_space host_drv_space ;
|
||||
|
||||
typedef Kokkos::Experimental::View< T , device > View0 ;
|
||||
typedef Kokkos::Experimental::View< T* , device > View1 ;
|
||||
typedef Kokkos::Experimental::View< T******* , device > View7 ;
|
||||
typedef Kokkos::View< T , device > View0 ;
|
||||
typedef Kokkos::View< T* , device > View1 ;
|
||||
typedef Kokkos::View< T******* , device > View7 ;
|
||||
|
||||
typedef typename View0::host_mirror_space host_view_space ;
|
||||
|
||||
@ -1127,8 +1127,7 @@ public:
|
||||
// T v2 = hx(0,0) ; // Generates compile error as intended
|
||||
// hx(0,0) = v2 ; // Generates compile error as intended
|
||||
|
||||
/*
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
#if 0 /* Asynchronous deep copies not implemented for dynamic rank view */
|
||||
// Testing with asynchronous deep copy with respect to device
|
||||
{
|
||||
size_t count = 0 ;
|
||||
@ -1193,7 +1192,7 @@ public:
|
||||
{ ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
|
||||
}}}}
|
||||
}
|
||||
#endif */ // #if ! KOKKOS_USING_EXP_VIEW
|
||||
#endif
|
||||
|
||||
// Testing with synchronous deep copy
|
||||
{
|
||||
|
||||
227
lib/kokkos/containers/unit_tests/TestErrorReporter.hpp
Normal file
227
lib/kokkos/containers/unit_tests/TestErrorReporter.hpp
Normal file
@ -0,0 +1,227 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
|
||||
#define KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <iostream>
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
// Just save the data in the report.  Informative text goes in the operator<<(..).
// Kept as a plain aggregate so it can be brace-initialized with three values.
template <typename DataType1, typename DataType2, typename DataType3>
struct ThreeValReport
{
  DataType1 m_data1;  // first stored value
  DataType2 m_data2;  // second stored value
  DataType3 m_data3;  // third stored value
};
|
||||
|
||||
template <typename DataType1, typename DataType2, typename DataType3>
|
||||
std::ostream &operator<<(std::ostream & os, const ThreeValReport<DataType1, DataType2, DataType3> &val)
|
||||
{
|
||||
return os << "{" << val.m_data1 << " " << val.m_data2 << " " << val.m_data3 << "}";
|
||||
}
|
||||
|
||||
template<typename ReportType>
|
||||
void checkReportersAndReportsAgree(const std::vector<int> &reporters,
|
||||
const std::vector<ReportType> &reports)
|
||||
{
|
||||
for (size_t i = 0; i < reports.size(); ++i) {
|
||||
EXPECT_EQ(1, reporters[i] % 2);
|
||||
EXPECT_EQ(reporters[i], reports[i].m_data1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename DeviceType>
|
||||
struct ErrorReporterDriverBase {
|
||||
|
||||
typedef ThreeValReport<int, int, double> report_type;
|
||||
typedef Kokkos::Experimental::ErrorReporter<report_type, DeviceType> error_reporter_type;
|
||||
error_reporter_type m_errorReporter;
|
||||
|
||||
ErrorReporterDriverBase(int reporter_capacity, int test_size)
|
||||
: m_errorReporter(reporter_capacity) { }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION bool error_condition(const int work_idx) const { return (work_idx % 2 != 0); }
|
||||
|
||||
void check_expectations(int reporter_capacity, int test_size)
|
||||
{
|
||||
int num_reported = m_errorReporter.getNumReports();
|
||||
int num_attempts = m_errorReporter.getNumReportAttempts();
|
||||
|
||||
int expected_num_reports = std::min(reporter_capacity, test_size / 2);
|
||||
EXPECT_EQ(expected_num_reports, num_reported);
|
||||
EXPECT_EQ(test_size / 2, num_attempts);
|
||||
|
||||
bool expect_full = (reporter_capacity <= (test_size / 2));
|
||||
bool reported_full = m_errorReporter.full();
|
||||
EXPECT_EQ(expect_full, reported_full);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ErrorReporterDriverType>
|
||||
void TestErrorReporter()
|
||||
{
|
||||
typedef ErrorReporterDriverType tester_type;
|
||||
std::vector<int> reporters;
|
||||
std::vector<typename tester_type::report_type> reports;
|
||||
|
||||
tester_type test1(100, 10);
|
||||
test1.m_errorReporter.getReports(reporters, reports);
|
||||
checkReportersAndReportsAgree(reporters, reports);
|
||||
|
||||
tester_type test2(10, 100);
|
||||
test2.m_errorReporter.getReports(reporters, reports);
|
||||
checkReportersAndReportsAgree(reporters, reports);
|
||||
|
||||
typename Kokkos::View<int*, typename ErrorReporterDriverType::execution_space >::HostMirror view_reporters;
|
||||
typename Kokkos::View<typename tester_type::report_type*, typename ErrorReporterDriverType::execution_space >::HostMirror
|
||||
view_reports;
|
||||
test2.m_errorReporter.getReports(view_reporters, view_reports);
|
||||
|
||||
int num_reports = view_reporters.extent(0);
|
||||
reporters.clear();
|
||||
reports.clear();
|
||||
reporters.reserve(num_reports);
|
||||
reports.reserve(num_reports);
|
||||
|
||||
for (int i = 0; i < num_reports; ++i) {
|
||||
reporters.push_back(view_reporters(i));
|
||||
reports.push_back(view_reports(i));
|
||||
}
|
||||
checkReportersAndReportsAgree(reporters, reports);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename DeviceType>
|
||||
struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
|
||||
{
|
||||
typedef ErrorReporterDriverBase<DeviceType> driver_base;
|
||||
typedef typename driver_base::error_reporter_type::execution_space execution_space;
|
||||
|
||||
ErrorReporterDriver(int reporter_capacity, int test_size)
|
||||
: driver_base(reporter_capacity, test_size)
|
||||
{
|
||||
execute(reporter_capacity, test_size);
|
||||
|
||||
// Test that clear() and resize() work across memory spaces.
|
||||
if (reporter_capacity < test_size) {
|
||||
driver_base::m_errorReporter.clear();
|
||||
driver_base::m_errorReporter.resize(test_size);
|
||||
execute(test_size, test_size);
|
||||
}
|
||||
}
|
||||
|
||||
void execute(int reporter_capacity, int test_size)
|
||||
{
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
|
||||
driver_base::check_expectations(reporter_capacity, test_size);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int work_idx) const
|
||||
{
|
||||
if (driver_base::error_condition(work_idx)) {
|
||||
double val = M_PI * static_cast<double>(work_idx);
|
||||
typename driver_base::report_type report = {work_idx, -2*work_idx, val};
|
||||
driver_base::m_errorReporter.add_report(work_idx, report);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(KOKKOS_CLASS_LAMBDA)
|
||||
template <typename DeviceType>
|
||||
struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
|
||||
{
|
||||
|
||||
typedef ErrorReporterDriverBase<DeviceType> driver_base;
|
||||
typedef typename driver_base::error_reporter_type::execution_space execution_space;
|
||||
|
||||
ErrorReporterDriverUseLambda(int reporter_capacity, int test_size)
|
||||
: driver_base(reporter_capacity, test_size)
|
||||
{
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), KOKKOS_CLASS_LAMBDA (const int work_idx) {
|
||||
if (driver_base::error_condition(work_idx)) {
|
||||
double val = M_PI * static_cast<double>(work_idx);
|
||||
typename driver_base::report_type report = {work_idx, -2*work_idx, val};
|
||||
driver_base::m_errorReporter.add_report(work_idx, report);
|
||||
}
|
||||
});
|
||||
driver_base::check_expectations(reporter_capacity, test_size);
|
||||
}
|
||||
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef KOKKOS_HAVE_OPENMP
|
||||
// Driver that fills the reporter from a raw "#pragma omp parallel for" loop
// instead of a Kokkos parallel dispatch, then checks the counters.
struct ErrorReporterDriverNativeOpenMP : public ErrorReporterDriverBase<Kokkos::OpenMP>
{
  typedef ErrorReporterDriverBase<Kokkos::OpenMP> driver_base;
  typedef typename driver_base::error_reporter_type::execution_space execution_space;

  ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size)
    : driver_base(reporter_capacity, test_size)
  {
#pragma omp parallel for
    for (int work_idx = 0; work_idx < test_size; ++work_idx)
    {
      if (driver_base::error_condition(work_idx)) {
        double val = M_PI * static_cast<double>(work_idx);
        typename driver_base::report_type report = {work_idx, -2*work_idx, val};
        driver_base::m_errorReporter.add_report(work_idx, report);
      }
    }

    driver_base::check_expectations(reporter_capacity, test_size);
  }
};
|
||||
#endif
|
||||
|
||||
} // namespace Test
|
||||
#endif // #ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
|
||||
@ -56,12 +56,14 @@
|
||||
#include <TestVector.hpp>
|
||||
#include <TestDualView.hpp>
|
||||
#include <TestDynamicView.hpp>
|
||||
#include <TestSegmentedView.hpp>
|
||||
#include <TestComplex.hpp>
|
||||
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
#include <TestDynViewAPI.hpp>
|
||||
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace Test {
|
||||
@ -143,11 +145,6 @@ TEST_F( openmp , staticcrsgraph )
|
||||
test_dualview_combinations<int,Kokkos::OpenMP>(size); \
|
||||
}
|
||||
|
||||
#define OPENMP_SEGMENTEDVIEW_TEST( size ) \
|
||||
TEST_F( openmp, segmentedview_##size##x) { \
|
||||
test_segmented_view<double,Kokkos::OpenMP>(size); \
|
||||
}
|
||||
|
||||
OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true)
|
||||
OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false)
|
||||
OPENMP_FAILED_INSERT_TEST( 10000, 1000 )
|
||||
@ -156,7 +153,6 @@ OPENMP_DEEP_COPY( 10000, 1 )
|
||||
OPENMP_VECTOR_COMBINE_TEST( 10 )
|
||||
OPENMP_VECTOR_COMBINE_TEST( 3057 )
|
||||
OPENMP_DUALVIEW_COMBINE_TEST( 10 )
|
||||
OPENMP_SEGMENTEDVIEW_TEST( 10000 )
|
||||
|
||||
#undef OPENMP_INSERT_TEST
|
||||
#undef OPENMP_FAILED_INSERT_TEST
|
||||
@ -164,7 +160,6 @@ OPENMP_SEGMENTEDVIEW_TEST( 10000 )
|
||||
#undef OPENMP_DEEP_COPY
|
||||
#undef OPENMP_VECTOR_COMBINE_TEST
|
||||
#undef OPENMP_DUALVIEW_COMBINE_TEST
|
||||
#undef OPENMP_SEGMENTEDVIEW_TEST
|
||||
#endif
|
||||
|
||||
|
||||
@ -178,5 +173,22 @@ TEST_F( openmp , dynamic_view )
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_CLASS_LAMBDA)
|
||||
TEST_F(openmp, ErrorReporterViaLambda)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::OpenMP>>();
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_F(openmp, ErrorReporter)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriver<Kokkos::OpenMP>>();
|
||||
}
|
||||
|
||||
TEST_F(openmp, ErrorReporterNativeOpenMP)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriverNativeOpenMP>();
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
|
||||
|
||||
@ -1,708 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP
|
||||
#define KOKKOS_TEST_SEGMENTEDVIEW_HPP
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
#include <Kokkos_SegmentedView.hpp>
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
|
||||
struct GrowTest;
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 1> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+team_member.team_size());
|
||||
value += team_idx + team_member.team_rank();
|
||||
|
||||
if((a.dimension_0()>team_idx+team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+team_member.team_rank()))
|
||||
a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank();
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 2> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
value += team_idx + team_member.team_rank() + 13*k;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) {
|
||||
a(team_idx+ team_member.team_rank(),k) =
|
||||
team_idx+ team_member.team_rank() + 13*k;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 3> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
value += team_idx + team_member.team_rank() + 13*k + 3*l;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
a(team_idx+ team_member.team_rank(),k,l) =
|
||||
team_idx+ team_member.team_rank() + 13*k + 3*l;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 4> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<2;m++)
|
||||
value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
a(team_idx+ team_member.team_rank(),k,l,m) =
|
||||
team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 5> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<2;m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<3;n++)
|
||||
value +=
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
a(team_idx+ team_member.team_rank(),k,l,m,n) =
|
||||
team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 6> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<2;m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<3;n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<2;o++)
|
||||
value +=
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
a(team_idx+ team_member.team_rank(),k,l,m,n,o) =
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 7> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
a.grow(team_member , team_idx+ team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<2;m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<3;n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<2;o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<4;p++)
|
||||
value +=
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
|
||||
a(team_idx+ team_member.team_rank(),k,l,m,n,o,p) =
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct GrowTest<ViewType , ExecutionSpace , 8> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
GrowTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
a.grow(team_member , team_idx + team_member.team_size());
|
||||
|
||||
for( typename ExecutionSpace::size_type k=0;k<7;k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<3;l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<2;m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<3;n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<2;o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<4;p++)
|
||||
for( typename ExecutionSpace::size_type q=0;q<3;q++)
|
||||
value +=
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
|
||||
for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
|
||||
a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q) =
|
||||
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Primary template of the verification functor; only declared here. The Rank
// parameter defaults to ViewType::Rank, and the specializations that follow
// (one per rank, 1 through 8) provide the definitions.
template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
|
||||
struct VerifyTest;
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 1> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
value += a(team_idx+ team_member.team_rank());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 2> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
value += a(team_idx+ team_member.team_rank(),k);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 3> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 4> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l,m);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 5> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l,m,n);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 6> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l,m,n,o);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 7> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType , class ExecutionSpace>
|
||||
struct VerifyTest<ViewType , ExecutionSpace , 8> {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
typedef typename Policy::member_type team_type;
|
||||
typedef double value_type;
|
||||
|
||||
ViewType a;
|
||||
|
||||
VerifyTest(ViewType in):a(in) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (team_type team_member, double& value) const {
|
||||
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
|
||||
|
||||
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
|
||||
(a.dimension(0)>team_idx+ team_member.team_rank())) {
|
||||
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
|
||||
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
|
||||
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
|
||||
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
|
||||
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
|
||||
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
|
||||
for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
|
||||
value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, class ExecutionSpace>
|
||||
struct test_segmented_view
|
||||
{
|
||||
typedef test_segmented_view<Scalar,ExecutionSpace> self_type;
|
||||
|
||||
typedef Scalar scalar_type;
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef Kokkos::TeamPolicy<execution_space> Policy;
|
||||
|
||||
double result;
|
||||
double reference;
|
||||
|
||||
template <class ViewType>
|
||||
void run_me(ViewType a, int max_length){
|
||||
const int team_size = Policy::team_size_max( GrowTest<ViewType,execution_space>(a) );
|
||||
const int nteams = max_length/team_size;
|
||||
|
||||
reference = 0;
|
||||
result = 0;
|
||||
|
||||
Kokkos::parallel_reduce(Policy(nteams,team_size),GrowTest<ViewType,execution_space>(a),reference);
|
||||
Kokkos::fence();
|
||||
Kokkos::parallel_reduce(Policy(nteams,team_size),VerifyTest<ViewType,execution_space>(a),result);
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
|
||||
test_segmented_view(unsigned int size,int rank)
|
||||
{
|
||||
reference = 0;
|
||||
result = 0;
|
||||
|
||||
const int dim_1 = 7;
|
||||
const int dim_2 = 3;
|
||||
const int dim_3 = 2;
|
||||
const int dim_4 = 3;
|
||||
const int dim_5 = 2;
|
||||
const int dim_6 = 4;
|
||||
//const int dim_7 = 3;
|
||||
|
||||
if(rank==1) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*,Kokkos::LayoutLeft,ExecutionSpace> rank1_view;
|
||||
run_me< rank1_view >(rank1_view("Rank1",128,size), size);
|
||||
}
|
||||
if(rank==2) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar**,Kokkos::LayoutLeft,ExecutionSpace> rank2_view;
|
||||
run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size);
|
||||
}
|
||||
if(rank==3) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2],Kokkos::LayoutRight,ExecutionSpace> rank3_view;
|
||||
run_me< rank3_view >(rank3_view("Rank3",128,size), size);
|
||||
}
|
||||
if(rank==4) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar****,Kokkos::LayoutRight,ExecutionSpace> rank4_view;
|
||||
run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size);
|
||||
}
|
||||
if(rank==5) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2][3],Kokkos::LayoutLeft,ExecutionSpace> rank5_view;
|
||||
run_me< rank5_view >(rank5_view("Rank5",128,size), size);
|
||||
}
|
||||
if(rank==6) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2],Kokkos::LayoutRight,ExecutionSpace> rank6_view;
|
||||
run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size);
|
||||
}
|
||||
if(rank==7) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*******,Kokkos::LayoutLeft,ExecutionSpace> rank7_view;
|
||||
run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size);
|
||||
}
|
||||
if(rank==8) {
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> rank8_view;
|
||||
run_me< rank8_view >(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
|
||||
|
||||
template <typename Scalar, class ExecutionSpace>
|
||||
void test_segmented_view(unsigned int size)
|
||||
{
|
||||
{
|
||||
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> view_type;
|
||||
view_type a("A",128,size,7,3,2,3);
|
||||
double reference;
|
||||
|
||||
Impl::GrowTest<view_type,ExecutionSpace> f(a);
|
||||
|
||||
const int team_size = Kokkos::TeamPolicy<ExecutionSpace>::team_size_max( f );
|
||||
const int nteams = (size+team_size-1)/team_size;
|
||||
|
||||
Kokkos::parallel_reduce(Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),f,reference);
|
||||
|
||||
size_t real_size = ((size+127)/128)*128;
|
||||
|
||||
ASSERT_EQ(real_size,a.dimension_0());
|
||||
ASSERT_EQ(7,a.dimension_1());
|
||||
ASSERT_EQ(3,a.dimension_2());
|
||||
ASSERT_EQ(2,a.dimension_3());
|
||||
ASSERT_EQ(3,a.dimension_4());
|
||||
ASSERT_EQ(2,a.dimension_5());
|
||||
ASSERT_EQ(4,a.dimension_6());
|
||||
ASSERT_EQ(3,a.dimension_7());
|
||||
ASSERT_EQ(real_size,a.dimension(0));
|
||||
ASSERT_EQ(7,a.dimension(1));
|
||||
ASSERT_EQ(3,a.dimension(2));
|
||||
ASSERT_EQ(2,a.dimension(3));
|
||||
ASSERT_EQ(3,a.dimension(4));
|
||||
ASSERT_EQ(2,a.dimension(5));
|
||||
ASSERT_EQ(4,a.dimension(6));
|
||||
ASSERT_EQ(3,a.dimension(7));
|
||||
ASSERT_EQ(8,a.Rank);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,1);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,2);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,3);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,4);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,5);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,6);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,7);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
{
|
||||
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,8);
|
||||
ASSERT_EQ(test.reference,test.result);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
} // namespace Test
|
||||
|
||||
#else
|
||||
|
||||
// No-op stand-in compiled on the #else side of the KOKKOS_USING_EXP_VIEW
// check above: it keeps the same name and signature as the real test entry
// point so calls to test_segmented_view<...>(size) still compile.
template <typename Scalar, class ExecutionSpace>
|
||||
void test_segmented_view(unsigned int ) {}
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */
|
||||
|
||||
@ -58,7 +58,6 @@
|
||||
#include <TestStaticCrsGraph.hpp>
|
||||
#include <TestVector.hpp>
|
||||
#include <TestDualView.hpp>
|
||||
#include <TestSegmentedView.hpp>
|
||||
#include <TestDynamicView.hpp>
|
||||
#include <TestComplex.hpp>
|
||||
|
||||
@ -67,6 +66,9 @@
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
#include <TestDynViewAPI.hpp>
|
||||
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class serial : public ::testing::Test {
|
||||
@ -135,11 +137,6 @@ TEST_F( serial, bitset )
|
||||
test_dualview_combinations<int,Kokkos::Serial>(size); \
|
||||
}
|
||||
|
||||
#define SERIAL_SEGMENTEDVIEW_TEST( size ) \
|
||||
TEST_F( serial, segmentedview_##size##x) { \
|
||||
test_segmented_view<double,Kokkos::Serial>(size); \
|
||||
}
|
||||
|
||||
SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true)
|
||||
SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false)
|
||||
SERIAL_FAILED_INSERT_TEST( 10000, 1000 )
|
||||
@ -148,7 +145,6 @@ SERIAL_DEEP_COPY( 10000, 1 )
|
||||
SERIAL_VECTOR_COMBINE_TEST( 10 )
|
||||
SERIAL_VECTOR_COMBINE_TEST( 3057 )
|
||||
SERIAL_DUALVIEW_COMBINE_TEST( 10 )
|
||||
SERIAL_SEGMENTEDVIEW_TEST( 10000 )
|
||||
|
||||
#undef SERIAL_INSERT_TEST
|
||||
#undef SERIAL_FAILED_INSERT_TEST
|
||||
@ -156,7 +152,6 @@ SERIAL_SEGMENTEDVIEW_TEST( 10000 )
|
||||
#undef SERIAL_DEEP_COPY
|
||||
#undef SERIAL_VECTOR_COMBINE_TEST
|
||||
#undef SERIAL_DUALVIEW_COMBINE_TEST
|
||||
#undef SERIAL_SEGMENTEDVIEW_TEST
|
||||
|
||||
TEST_F( serial , dynamic_view )
|
||||
{
|
||||
@ -168,6 +163,19 @@ TEST_F( serial , dynamic_view )
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_CLASS_LAMBDA)
|
||||
TEST_F(serial, ErrorReporterViaLambda)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Serial>>();
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_F(serial, ErrorReporter)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriver<Kokkos::Serial>>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace Test
|
||||
|
||||
#endif // KOKKOS_HAVE_SERIAL
|
||||
|
||||
@ -62,11 +62,13 @@
|
||||
#include <TestVector.hpp>
|
||||
#include <TestDualView.hpp>
|
||||
#include <TestDynamicView.hpp>
|
||||
#include <TestSegmentedView.hpp>
|
||||
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
#include <TestDynViewAPI.hpp>
|
||||
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class threads : public ::testing::Test {
|
||||
@ -145,12 +147,6 @@ TEST_F( threads , staticcrsgraph )
|
||||
test_dualview_combinations<int,Kokkos::Threads>(size); \
|
||||
}
|
||||
|
||||
#define THREADS_SEGMENTEDVIEW_TEST( size ) \
|
||||
TEST_F( threads, segmentedview_##size##x) { \
|
||||
test_segmented_view<double,Kokkos::Threads>(size); \
|
||||
}
|
||||
|
||||
|
||||
THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false)
|
||||
THREADS_FAILED_INSERT_TEST( 10000, 1000 )
|
||||
THREADS_DEEP_COPY( 10000, 1 )
|
||||
@ -158,7 +154,6 @@ THREADS_DEEP_COPY( 10000, 1 )
|
||||
THREADS_VECTOR_COMBINE_TEST( 10 )
|
||||
THREADS_VECTOR_COMBINE_TEST( 3057 )
|
||||
THREADS_DUALVIEW_COMBINE_TEST( 10 )
|
||||
THREADS_SEGMENTEDVIEW_TEST( 10000 )
|
||||
|
||||
|
||||
#undef THREADS_INSERT_TEST
|
||||
@ -167,8 +162,6 @@ THREADS_SEGMENTEDVIEW_TEST( 10000 )
|
||||
#undef THREADS_DEEP_COPY
|
||||
#undef THREADS_VECTOR_COMBINE_TEST
|
||||
#undef THREADS_DUALVIEW_COMBINE_TEST
|
||||
#undef THREADS_SEGMENTEDVIEW_TEST
|
||||
|
||||
|
||||
|
||||
TEST_F( threads , dynamic_view )
|
||||
@ -181,6 +174,19 @@ TEST_F( threads , dynamic_view )
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(KOKKOS_CLASS_LAMBDA)
|
||||
TEST_F(threads, ErrorReporterViaLambda)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Threads>>();
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_F(threads, ErrorReporter)
|
||||
{
|
||||
TestErrorReporter<ErrorReporterDriver<Kokkos::Threads>>();
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
|
||||
|
||||
@ -2,3 +2,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
|
||||
TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
|
||||
@ -45,6 +45,16 @@
|
||||
#define KOKKOS_ENABLE_PROFILING 0
|
||||
#endif
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_CUDA_RDC
|
||||
#ifdef KOKKOS_HAVE_CUDA_RDC
|
||||
#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1
|
||||
#endif
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_CUDA_LAMBDA
|
||||
#ifdef KOKKOS_HAVE_CUDA_LAMBDA
|
||||
#define KOKKOS_CUDA_USE_LAMBDA 1
|
||||
#endif
|
||||
|
||||
// Don't forbid users from defining this macro on the command line,
|
||||
// but still make sure that CMake logic can control its definition.
|
||||
#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
|
||||
# Put this directory's build tree on the include path.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
SET(SOURCES
|
||||
PerfTestMain.cpp
|
||||
@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE(
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
TRIBITS_ADD_TEST(
|
||||
PerfTest
|
||||
NAME PerfTestExec
|
||||
COMM serial mpi
|
||||
|
||||
@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/core/perf_test
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
else
|
||||
CXX ?= g++
|
||||
CXXFLAGS ?= -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
endif
|
||||
|
||||
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
|
||||
|
||||
TEST_TARGETS =
|
||||
|
||||
@ -79,10 +79,21 @@ class host : public ::testing::Test {
|
||||
protected:
|
||||
static void SetUpTestCase()
|
||||
{
|
||||
const unsigned team_count = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned threads_per_team = 4 ;
|
||||
if(Kokkos::hwloc::available()) {
|
||||
const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
|
||||
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
|
||||
|
||||
TestHostDevice::initialize( team_count * threads_per_team );
|
||||
unsigned threads_count = 0 ;
|
||||
|
||||
threads_count = std::max( 1u , numa_count )
|
||||
* std::max( 2u , cores_per_numa * threads_per_core );
|
||||
|
||||
TestHostDevice::initialize( threads_count );
|
||||
} else {
|
||||
const unsigned thread_count = 4 ;
|
||||
TestHostDevice::initialize( thread_count );
|
||||
}
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
||||
@ -1,334 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 )
|
||||
{
|
||||
const int r =
|
||||
( n0 <= i0 ? 0 :
|
||||
( n1 <= i1 ? 1 :
|
||||
( n2 <= i2 ? 2 :
|
||||
( n3 <= i3 ? 3 :
|
||||
( n4 <= i4 ? 4 :
|
||||
( n5 <= i5 ? 5 :
|
||||
( n6 <= i6 ? 6 : 7 )))))));
|
||||
const size_t n =
|
||||
( n0 <= i0 ? n0 :
|
||||
( n1 <= i1 ? n1 :
|
||||
( n2 <= i2 ? n2 :
|
||||
( n3 <= i3 ? n3 :
|
||||
( n4 <= i4 ? n4 :
|
||||
( n5 <= i5 ? n5 :
|
||||
( n6 <= i6 ? n6 : n7 )))))));
|
||||
const size_t i =
|
||||
( n0 <= i0 ? i0 :
|
||||
( n1 <= i1 ? i1 :
|
||||
( n2 <= i2 ? i2 :
|
||||
( n3 <= i3 ? i3 :
|
||||
( n4 <= i4 ? i4 :
|
||||
( n5 <= i5 ? i5 :
|
||||
( n6 <= i6 ? i6 : i7 )))))));
|
||||
printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n );
|
||||
Kokkos::Impl::cuda_abort("Cuda view array bounds error");
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
|
||||
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
|
||||
// Any other scalar type falls back to either normal reads out of global memory,
|
||||
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
|
||||
|
||||
template< typename ValueType , typename AliasType >
|
||||
struct CudaTextureFetch {
|
||||
|
||||
::cudaTextureObject_t m_obj ;
|
||||
const ValueType * m_ptr ;
|
||||
int m_offset ;
|
||||
|
||||
// Deference operator pulls through texture object and returns by value
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#else
|
||||
return m_ptr[ i ];
|
||||
#endif
|
||||
}
|
||||
|
||||
// Pointer to referenced memory
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_ptr ; }
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaTextureFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_ptr( rhs.m_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( CudaTextureFetch && rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_ptr( rhs.m_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_ptr = rhs.m_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_ptr = rhs.m_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
// Texture object spans the entire allocation.
|
||||
// This handle may view a subset of the allocation, so an offset is required.
|
||||
template< class CudaMemorySpace >
|
||||
inline explicit
|
||||
CudaTextureFetch( const ValueType * const arg_ptr
|
||||
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
|
||||
)
|
||||
: m_obj( record.template attach_texture_object< AliasType >() )
|
||||
, m_ptr( arg_ptr )
|
||||
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
|
||||
{}
|
||||
};
|
||||
|
||||
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
|
||||
|
||||
template< typename ValueType , typename AliasType >
|
||||
struct CudaLDGFetch {
|
||||
|
||||
const ValueType * m_ptr ;
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_ptr ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch() : m_ptr() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaLDGFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch( const CudaLDGFetch & rhs )
|
||||
: m_ptr( rhs.m_ptr )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch( CudaLDGFetch && rhs )
|
||||
: m_ptr( rhs.m_ptr )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
|
||||
{
|
||||
m_ptr = rhs.m_ptr ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
|
||||
{
|
||||
m_ptr = rhs.m_ptr ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
template< class CudaMemorySpace >
|
||||
inline explicit
|
||||
CudaTextureFetch( const ValueType * const arg_ptr
|
||||
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
|
||||
)
|
||||
: m_ptr( arg_data_ptr )
|
||||
{}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
|
||||
* if 'const' value type, CudaSpace and random access.
|
||||
*/
|
||||
template< class Traits >
|
||||
class ViewDataHandle< Traits ,
|
||||
typename std::enable_if<(
|
||||
// Is Cuda memory space
|
||||
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
|
||||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
|
||||
&&
|
||||
// Is a trivial const value of 4, 8, or 16 bytes
|
||||
std::is_trivial<typename Traits::const_value_type>::value
|
||||
&&
|
||||
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
|
||||
&&
|
||||
( sizeof(typename Traits::const_value_type) == 4 ||
|
||||
sizeof(typename Traits::const_value_type) == 8 ||
|
||||
sizeof(typename Traits::const_value_type) == 16 )
|
||||
&&
|
||||
// Random access trait
|
||||
( Traits::memory_traits::RandomAccess != 0 )
|
||||
)>::type >
|
||||
{
|
||||
public:
|
||||
|
||||
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
|
||||
|
||||
using value_type = typename Traits::const_value_type ;
|
||||
using return_type = typename Traits::const_value_type ; // NOT a reference
|
||||
|
||||
using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int ,
|
||||
typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 ,
|
||||
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
|
||||
>::type
|
||||
>::type
|
||||
>::type ;
|
||||
|
||||
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
|
||||
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
|
||||
#else
|
||||
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
|
||||
{
|
||||
return arg_handle ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
// Assignment of texture = non-texture requires creation of a texture object
|
||||
// which can only occur on the host. In addition, 'get_record' is only valid
|
||||
// if called in a host execution space
|
||||
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
|
||||
#else
|
||||
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
|
||||
return handle_type();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP */
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
@ -58,6 +59,11 @@
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -65,6 +71,9 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
namespace {
|
||||
|
||||
static std::atomic<int> num_uvm_allocations(0) ;
|
||||
|
||||
cudaStream_t get_deep_copy_stream() {
|
||||
static cudaStream_t s = 0;
|
||||
if( s == 0) {
|
||||
@ -119,6 +128,7 @@ void CudaSpace::access_error( const void * const )
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
bool CudaUVMSpace::available()
|
||||
@ -133,6 +143,11 @@ bool CudaUVMSpace::available()
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
int CudaUVMSpace::number_of_allocations()
|
||||
{
|
||||
return Kokkos::Impl::num_uvm_allocations.load();
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
@ -167,7 +182,18 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
|
||||
{
|
||||
void * ptr = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
|
||||
enum { max_uvm_allocations = 65536 };
|
||||
|
||||
if ( arg_alloc_size > 0 )
|
||||
{
|
||||
Kokkos::Impl::num_uvm_allocations++;
|
||||
|
||||
if ( Kokkos::Impl::num_uvm_allocations.load() > max_uvm_allocations ) {
|
||||
Kokkos::Impl::throw_runtime_exception( "CudaUVM error: The maximum limit of UVM allocations exceeded (currently 65536)." ) ;
|
||||
}
|
||||
|
||||
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
|
||||
}
|
||||
|
||||
return ptr ;
|
||||
}
|
||||
@ -191,7 +217,10 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all
|
||||
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
|
||||
{
|
||||
try {
|
||||
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
|
||||
if ( arg_alloc_ptr != nullptr ) {
|
||||
Kokkos::Impl::num_uvm_allocations--;
|
||||
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
|
||||
}
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
@ -202,13 +231,24 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
// Returns the m_name member of CudaSpace.
// NOTE(review): a constexpr function is implicitly inline, so defining it
// out of line in one source file may leave other translation units without
// a definition - confirm against the declaration in the header.
constexpr const char* CudaSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
// Returns the m_name member of CudaUVMSpace.
// NOTE(review): a constexpr function is implicitly inline, so defining it
// out of line in one source file may leave other translation units without
// a definition - confirm against the declaration in the header.
constexpr const char* CudaUVMSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
// Returns the m_name member of CudaHostPinnedSpace.
// NOTE(review): a constexpr function is implicitly inline, so defining it
// out of line in one source file may leave other translation units without
// a definition - confirm against the declaration in the header.
constexpr const char* CudaHostPinnedSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
SharedAllocationRecord< void , void >
|
||||
@ -335,6 +375,18 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
|
||||
SharedAllocationHeader header ;
|
||||
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
|
||||
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),header.m_label,
|
||||
data(),size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
|
||||
, SharedAllocationRecord< void , void >::m_alloc_size
|
||||
);
|
||||
@ -343,6 +395,15 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
|
||||
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::fence(); //Make sure I can access the label ...
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
data(),size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
|
||||
, SharedAllocationRecord< void , void >::m_alloc_size
|
||||
);
|
||||
@ -351,6 +412,14 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
|
||||
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
data(),size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
|
||||
, SharedAllocationRecord< void , void >::m_alloc_size
|
||||
);
|
||||
@ -373,6 +442,12 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
|
||||
, m_tex_obj( 0 )
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
SharedAllocationHeader header ;
|
||||
|
||||
// Fill in the Header information
|
||||
@ -404,7 +479,12 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
|
||||
, m_tex_obj( 0 )
|
||||
, m_space( arg_space )
|
||||
{
|
||||
// Fill in the Header information, directly accessible via UVM
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
// Fill in the Header information, directly accessible via UVM
|
||||
|
||||
RecordBase::m_alloc_ptr->m_record = this ;
|
||||
|
||||
@ -430,6 +510,11 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
// Fill in the Header information, directly accessible via UVM
|
||||
|
||||
RecordBase::m_alloc_ptr->m_record = this ;
|
||||
@ -502,6 +587,7 @@ void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
|
||||
deallocate_tracked( void * const arg_alloc_ptr )
|
||||
{
|
||||
if ( arg_alloc_ptr != 0 ) {
|
||||
|
||||
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
|
||||
|
||||
RecordBase::decrement( r );
|
||||
@ -587,7 +673,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
|
||||
RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
|
||||
|
||||
if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
#else
|
||||
@ -598,7 +684,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
|
||||
RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
|
||||
|
||||
if ( record == 0 ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -615,7 +701,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_
|
||||
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
|
||||
|
||||
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
return static_cast< RecordCuda * >( h->m_record );
|
||||
@ -630,7 +716,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
|
||||
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
|
||||
|
||||
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
return static_cast< RecordCuda * >( h->m_record );
|
||||
@ -728,7 +814,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -384,10 +384,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
const bool ok_id = 0 <= cuda_device_id &&
|
||||
cuda_device_id < dev_info.m_cudaDevCount ;
|
||||
|
||||
// Need device capability 2.0 or better
|
||||
// Need device capability 3.0 or better
|
||||
|
||||
const bool ok_dev = ok_id &&
|
||||
( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
|
||||
( 3 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
|
||||
0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
|
||||
|
||||
if ( ok_init && ok_dev ) {
|
||||
@ -444,7 +444,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
//----------------------------------
|
||||
// Maximum number of blocks:
|
||||
|
||||
m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
|
||||
m_maxBlock = cudaProp.maxGridSize[0] ;
|
||||
|
||||
//----------------------------------
|
||||
|
||||
@ -495,7 +495,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
|
||||
msg << "." ;
|
||||
msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
|
||||
msg << " has insufficient capability, required 2.0 or better" ;
|
||||
msg << " has insufficient capability, required 3.0 or better" ;
|
||||
}
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -95,27 +95,42 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
|
||||
__device__ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & team_shmem() const
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
__device__ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & team_scratch(const int& level) const
|
||||
{ return m_team_shared.set_team_thread_mode(level,1,0) ; }
|
||||
__device__ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & thread_scratch(const int& level) const
|
||||
{ return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
|
||||
|
||||
__device__ inline int league_rank() const { return m_league_rank ; }
|
||||
__device__ inline int league_size() const { return m_league_size ; }
|
||||
__device__ inline int team_rank() const { return threadIdx.y ; }
|
||||
__device__ inline int team_size() const { return blockDim.y ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
|
||||
/// Rank of the calling thread within its team.
/// Device code returns threadIdx.y.  The host fallback returns 0: the host
/// fallback of team_size() is 1, so 0 is the only rank inside [0,team_size()).
KOKKOS_INLINE_FUNCTION int team_rank() const {
#ifdef __CUDA_ARCH__
  return threadIdx.y ;
#else
  return 0 ;
#endif
}
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return blockDim.y ;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ inline void team_barrier() const { __syncthreads(); }
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
__syncthreads();
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class ValueType>
|
||||
__device__ inline void team_broadcast(ValueType& value, const int& thread_id) const {
|
||||
KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, const int& thread_id) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
__shared__ ValueType sh_val;
|
||||
if(threadIdx.x == 0 && threadIdx.y == thread_id) {
|
||||
sh_val = value;
|
||||
@ -123,26 +138,17 @@ public:
|
||||
team_barrier();
|
||||
value = sh_val;
|
||||
team_barrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
template< class ValueType, class JoinOp >
|
||||
__device__ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename JoinOp::value_type team_reduce( const ValueType & value
|
||||
, const JoinOp & op_in ) const
|
||||
{
|
||||
, const JoinOp & op_in ) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
|
||||
const JoinOpFunctor op(op_in);
|
||||
ValueType * const base_data = (ValueType *) m_team_reduce ;
|
||||
#else
|
||||
template< class JoinOp >
|
||||
__device__ inline
|
||||
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
|
||||
, const JoinOp & op ) const
|
||||
{
|
||||
typedef JoinOp JoinOpFunctor ;
|
||||
typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ;
|
||||
#endif
|
||||
|
||||
__syncthreads(); // Don't write in to shared data until all threads have entered this function
|
||||
|
||||
@ -153,6 +159,9 @@ public:
|
||||
Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data );
|
||||
|
||||
return base_data[ blockDim.y - 1 ];
|
||||
#else
|
||||
return typename JoinOp::value_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
@ -165,8 +174,8 @@ public:
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename Type >
|
||||
__device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
Type * const base_data = (Type *) m_team_reduce ;
|
||||
|
||||
__syncthreads(); // Don't write in to shared data until all threads have entered this function
|
||||
@ -186,6 +195,9 @@ public:
|
||||
}
|
||||
|
||||
return base_data[ threadIdx.y ];
|
||||
#else
|
||||
return Type();
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
@ -194,13 +206,14 @@ public:
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
__device__ inline Type team_scan( const Type & value ) const
|
||||
{ return this->template team_scan<Type>( value , 0 ); }
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
|
||||
return this->template team_scan<Type>( value , 0 );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
__device__ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTeamMember( void * shared
|
||||
, const int shared_begin
|
||||
, const int shared_size
|
||||
@ -210,51 +223,10 @@ public:
|
||||
, const int arg_league_size )
|
||||
: m_team_reduce( shared )
|
||||
, m_team_shared( ((char *)shared) + shared_begin , shared_size, scratch_level_1_ptr, scratch_level_1_size)
|
||||
, m_league_rank( arg_league_rank )
|
||||
, m_league_size( arg_league_size )
|
||||
, m_league_rank( arg_league_rank )
|
||||
, m_league_size( arg_league_size )
|
||||
{}
|
||||
|
||||
#else
|
||||
|
||||
const execution_space::scratch_memory_space & team_shmem() const
|
||||
{ return m_team_shared.set_team_thread_mode(0, 1,0) ; }
|
||||
const execution_space::scratch_memory_space & team_scratch(const int& level) const
|
||||
{ return m_team_shared.set_team_thread_mode(level,1,0) ; }
|
||||
const execution_space::scratch_memory_space & thread_scratch(const int& level) const
|
||||
{ return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
|
||||
|
||||
int league_rank() const {return 0;}
|
||||
int league_size() const {return 1;}
|
||||
int team_rank() const {return 0;}
|
||||
int team_size() const {return 1;}
|
||||
|
||||
void team_barrier() const {}
|
||||
template<class ValueType>
|
||||
void team_broadcast(ValueType& value, const int& thread_id) const {}
|
||||
|
||||
template< class JoinOp >
|
||||
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
|
||||
, const JoinOp & op ) const {return typename JoinOp::value_type();}
|
||||
|
||||
template< typename Type >
|
||||
Type team_scan( const Type & value , Type * const global_accum ) const {return Type();}
|
||||
|
||||
template< typename Type >
|
||||
Type team_scan( const Type & value ) const {return Type();}
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
CudaTeamMember( void * shared
|
||||
, const int shared_begin
|
||||
, const int shared_end
|
||||
, void* scratch_level_1_ptr
|
||||
, const int scratch_level_1_size
|
||||
, const int arg_league_rank
|
||||
, const int arg_league_size );
|
||||
|
||||
#endif /* #if ! defined( __CUDA_ARCH__ ) */
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -356,7 +328,7 @@ public:
|
||||
, m_vector_length( 0 )
|
||||
, m_team_scratch_size {0,0}
|
||||
, m_thread_scratch_size {0,0}
|
||||
, m_chunk_size ( 32 )
|
||||
, m_chunk_size ( 32 )
|
||||
{}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
@ -508,7 +480,7 @@ private:
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const Policy m_policy ;
|
||||
|
||||
ParallelFor() = delete ;
|
||||
ParallelFor & operator = ( const ParallelFor & ) = delete ;
|
||||
@ -638,8 +610,8 @@ public:
|
||||
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
)
|
||||
: m_functor( arg_functor )
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
@ -680,7 +652,7 @@ template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Cuda
|
||||
, Kokkos::Cuda
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -835,23 +807,22 @@ public:
|
||||
const int nwork = m_policy.end() - m_policy.begin();
|
||||
if ( nwork ) {
|
||||
const int block_size = local_block_size( m_functor );
|
||||
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
|
||||
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
|
||||
|
||||
|
||||
// REQUIRED ( 1 , N , 1 )
|
||||
const dim3 block( 1 , block_size , 1 );
|
||||
// Required grid.x <= block.y
|
||||
const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
|
||||
|
||||
|
||||
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
|
||||
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
@ -871,8 +842,8 @@ public:
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
@ -925,7 +896,6 @@ private:
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
typedef FunctorType functor_type ;
|
||||
@ -937,7 +907,6 @@ private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
|
||||
|
||||
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
|
||||
// shared memory utilization:
|
||||
//
|
||||
@ -1058,36 +1027,44 @@ public:
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
|
||||
:std::min( m_league_size , m_team_size );
|
||||
const int nwork = m_league_size * m_team_size ;
|
||||
if ( nwork ) {
|
||||
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
|
||||
:std::min( m_league_size , m_team_size );
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
|
||||
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
|
||||
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
|
||||
|
||||
const dim3 block( m_vector_size , m_team_size , 1 );
|
||||
const dim3 grid( block_count , 1 , 1 );
|
||||
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
|
||||
const dim3 block( m_vector_size , m_team_size , 1 );
|
||||
const dim3 grid( block_count , 1 , 1 );
|
||||
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
|
||||
}
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
|
||||
}
|
||||
else {
|
||||
if (m_result_ptr) {
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
@ -1106,9 +1083,18 @@ public:
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
, m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
|
||||
, m_scratch_size{
|
||||
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
)}
|
||||
{
|
||||
// Return Init value if the number of worksets is zero
|
||||
if( arg_policy.league_size() == 0) {
|
||||
@ -1342,7 +1328,7 @@ private:
|
||||
}
|
||||
|
||||
// Scan block values into locations shared_data[1..blockDim.y]
|
||||
cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) );
|
||||
cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , typename ValueTraits::pointer_type(shared_data+word_count.value) );
|
||||
|
||||
{
|
||||
size_type * const block_total = shared_data + word_count.value * blockDim.y ;
|
||||
@ -1391,32 +1377,32 @@ public:
|
||||
const int nwork = m_policy.end() - m_policy.begin();
|
||||
if ( nwork ) {
|
||||
enum { GridMaxComputeCapability_2x = 0x0ffff };
|
||||
|
||||
|
||||
const int block_size = local_block_size( m_functor );
|
||||
|
||||
|
||||
const int grid_max =
|
||||
( block_size * block_size ) < GridMaxComputeCapability_2x ?
|
||||
( block_size * block_size ) : GridMaxComputeCapability_2x ;
|
||||
|
||||
|
||||
// At most 'max_grid' blocks:
|
||||
const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
|
||||
|
||||
|
||||
// How much work per block:
|
||||
const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
|
||||
|
||||
|
||||
// How many block are really needed for this much work:
|
||||
const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ;
|
||||
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
|
||||
|
||||
|
||||
const dim3 grid( grid_x , 1 , 1 );
|
||||
const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
|
||||
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
|
||||
|
||||
|
||||
m_final = false ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
|
||||
m_final = true ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
}
|
||||
@ -1490,18 +1476,30 @@ namespace Impl {
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ inline
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count):
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count):
|
||||
start( threadIdx.x ),
|
||||
end( count ),
|
||||
increment( blockDim.x )
|
||||
{}
|
||||
__device__ inline
|
||||
ThreadVectorRangeBoundariesStruct (const iType& count):
|
||||
start( threadIdx.x ),
|
||||
end( count ),
|
||||
increment( blockDim.x )
|
||||
{}
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count):
|
||||
start( 0 ),
|
||||
end( count ),
|
||||
increment( 1 )
|
||||
{}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const iType& count):
|
||||
start( 0 ),
|
||||
end( count ),
|
||||
increment( 1 )
|
||||
{}
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -1509,22 +1507,24 @@ namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,count);
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,begin,end);
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
|
||||
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
|
||||
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
@ -1571,9 +1571,10 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Cud
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
|
||||
Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
|
||||
|
||||
Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src)
|
||||
{ dst+=src; });
|
||||
Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src)
|
||||
{ dst+=src; });
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1923,4 +1924,3 @@ namespace Impl {
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -139,6 +139,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
|
||||
Cuda::size_type * const m_scratch_flags,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
|
||||
|
||||
@ -213,6 +214,9 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -290,10 +294,10 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
||||
|
||||
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
|
||||
|
||||
BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -308,12 +312,19 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
||||
( rtid_intra & 16 ) ? 16 : 0 ))));
|
||||
|
||||
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
|
||||
|
||||
#ifdef KOKKOS_CUDA_CLANG_WORKAROUND
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads();
|
||||
#else
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0)
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef BLOCK_SCAN_STEP
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
@ -174,6 +174,6 @@ printf("cuda_task_queue_execute after\n");
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,7 +44,7 @@
|
||||
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
#define KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -99,7 +99,7 @@ public:
|
||||
extern template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
|
||||
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
|
||||
* passed to tasks running in a Cuda space.
|
||||
*
|
||||
* Cuda thread blocks for tasking are dimensioned:
|
||||
@ -234,19 +234,23 @@ namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
|
||||
, const iType & count )
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
template<typename iType1, typename iType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
|
||||
Impl::TeamThreadRangeBoundariesStruct
|
||||
< typename std::common_type<iType1,iType2>::type
|
||||
, Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
|
||||
, const iType1 & begin, const iType2 & end )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
|
||||
thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
@ -315,7 +319,7 @@ ValueType shfl_warp_broadcast
|
||||
}
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -344,7 +348,7 @@ void parallel_reduce
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
// if no join() provided, use sum
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -372,7 +376,7 @@ void parallel_reduce
|
||||
}
|
||||
|
||||
// all-reduce within team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -397,7 +401,7 @@ void parallel_reduce
|
||||
|
||||
// all-reduce within team members within warp
|
||||
// if no join() provided, use sum
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -426,7 +430,7 @@ void parallel_reduce
|
||||
}
|
||||
|
||||
// scan across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -469,7 +473,7 @@ void parallel_scan
|
||||
}
|
||||
|
||||
// scan within team member (vector) within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
@ -514,6 +518,6 @@ void parallel_scan
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
|
||||
|
||||
|
||||
@ -1,932 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD
|
||||
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
// #define DETAILED_PRINT
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Sentinel pointer values (never dereferenced in the code below) stored in
// task-pointer slots:
//   QLOCK   - all address bits set; pop_ready_task() swaps it into a queue
//             head while it detaches the head task, and treats a head equal
//             to it as "queue currently locked".
//   QDENIED - all-bits-set minus one; driver() stores it in its shared
//             team_task slot when m_count_ready is zero, which terminates
//             the driver loop.
#define QLOCK reinterpret_cast<void*>( ~((uintptr_t)0) )
|
||||
#define QDENIED reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
void CudaTaskPolicyQueue::Destroy::destroy_shared_allocation()
|
||||
{
|
||||
// Verify the queue is empty
|
||||
|
||||
if ( m_policy->m_count_ready ||
|
||||
m_policy->m_team[0] ||
|
||||
m_policy->m_team[1] ||
|
||||
m_policy->m_team[2] ||
|
||||
m_policy->m_serial[0] ||
|
||||
m_policy->m_serial[1] ||
|
||||
m_policy->m_serial[2] ) {
|
||||
Kokkos::abort("CudaTaskPolicyQueue ERROR : Attempt to destroy non-empty queue" );
|
||||
}
|
||||
|
||||
m_policy->~CudaTaskPolicyQueue();
|
||||
|
||||
Kokkos::Cuda::fence();
|
||||
}
|
||||
|
||||
// Destructor: the body is intentionally empty, no explicit work is done here.
CudaTaskPolicyQueue::~CudaTaskPolicyQueue() {}
|
||||
|
||||
CudaTaskPolicyQueue::
|
||||
CudaTaskPolicyQueue
|
||||
( const unsigned arg_task_max_count
|
||||
, const unsigned arg_task_max_size
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_team_size
|
||||
)
|
||||
: m_space( Kokkos::CudaUVMSpace()
|
||||
, arg_task_max_size * arg_task_max_count * 1.2
|
||||
, 16 /* log2(superblock size) */
|
||||
)
|
||||
, m_team { 0 , 0 , 0 }
|
||||
, m_serial { 0 , 0 , 0 }
|
||||
, m_team_size( 32 /* 1 warps */ )
|
||||
, m_default_dependence_capacity( arg_task_default_dependence_capacity )
|
||||
, m_count_ready(0)
|
||||
{
|
||||
constexpr int max_team_size = 32 * 16 /* 16 warps */ ;
|
||||
|
||||
const int target_team_size =
|
||||
std::min( int(arg_team_size) , max_team_size );
|
||||
|
||||
while ( m_team_size < target_team_size ) { m_team_size *= 2 ; }
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Called by each block & thread
|
||||
|
||||
// Device-side scheduling loop, executed by every thread of every block.
//
// Per iteration:
//   1. The team lead (threadIdx.x == 0 && threadIdx.y == 0) picks the next
//      action and publishes it through the __shared__ variable team_task:
//        - q_denied when m_count_ready == 0, which ends the loop;
//        - otherwise the first task popped from m_team[0..NPRIORITY-1],
//          or 0 if none was obtained.  Queues with index >= 2 are polled
//          only while '! m_space.is_empty()' holds.
//   2. After the barrier every thread reads team_task.
//        - Non-null task: a CudaTeamMember is built over the block's shared
//          memory (shared_begin 16, size team_task->m_shmem_size, league
//          rank 0 of 1), the task's m_team function is called with it, and
//          the team lead then calls complete_executed_task().
//        - Null: each thread with threadIdx.x == 0 and threadIdx.y a
//          multiple of 32 tries to pop one task from m_serial[] under the
//          same queue-index rule, runs its m_serial function and completes
//          it; the whole block then meets at a barrier.
//
// NOTE(review): the loop condition re-reads the shared team_task without a
// barrier in the team-task branch; the existing comment below indicates the
// task function itself synchronizes the block - confirm.
__device__
|
||||
void Kokkos::Experimental::Impl::CudaTaskPolicyQueue::driver()
|
||||
{
|
||||
task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
|
||||
|
||||
// Local helper macro; undefined again at the end of this function.
#define IS_TEAM_LEAD ( threadIdx.x == 0 && threadIdx.y == 0 )
|
||||
|
||||
#ifdef DETAILED_PRINT
|
||||
if ( IS_TEAM_LEAD ) {
|
||||
printf( "CudaTaskPolicyQueue::driver() begin on %d with count %d\n"
|
||||
, blockIdx.x , m_count_ready );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Each thread block must iterate this loop synchronously
|
||||
// to ensure team-execution of team-task
|
||||
|
||||
__shared__ task_root_type * team_task ;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
do {
|
||||
|
||||
if ( IS_TEAM_LEAD ) {
|
||||
if ( 0 == m_count_ready ) {
|
||||
team_task = q_denied ; // All queues are empty and no running tasks
|
||||
}
|
||||
else {
|
||||
team_task = 0 ;
|
||||
// Stop at the first queue that yields a task.
for ( int i = 0 ; i < int(NPRIORITY) && 0 == team_task ; ++i ) {
|
||||
if ( ( i < 2 /* regular queue */ )
|
||||
|| ( ! m_space.is_empty() /* waiting for memory */ ) ) {
|
||||
team_task = pop_ready_task( & m_team[i] );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Barrier: makes the team lead's choice visible to the whole block.
__syncthreads();
|
||||
|
||||
#ifdef DETAILED_PRINT
|
||||
if ( IS_TEAM_LEAD && 0 != team_task ) {
|
||||
printf( "CudaTaskPolicyQueue::driver() (%d) team_task(0x%lx)\n"
|
||||
, blockIdx.x
|
||||
, (unsigned long) team_task );
|
||||
}
|
||||
#endif
|
||||
|
||||
// team_task == q_denied if all queues are empty
|
||||
// team_task == 0 if no team tasks available
|
||||
|
||||
if ( q_denied != team_task ) {
|
||||
if ( 0 != team_task ) {
|
||||
|
||||
Kokkos::Impl::CudaTeamMember
|
||||
member( kokkos_impl_cuda_shared_memory<void>()
|
||||
, 16 /* shared_begin */
|
||||
, team_task->m_shmem_size /* shared size */
|
||||
, 0 /* scratch level 1 pointer */
|
||||
, 0 /* scratch level 1 size */
|
||||
, 0 /* league rank */
|
||||
, 1 /* league size */
|
||||
);
|
||||
|
||||
(*team_task->m_team)( team_task , member );
|
||||
|
||||
// A __syncthreads was called and if completed the
|
||||
// functor was destroyed.
|
||||
|
||||
if ( IS_TEAM_LEAD ) {
|
||||
complete_executed_task( team_task );
|
||||
}
|
||||
}
|
||||
else {
|
||||
// One thread of one warp performs this serial task
|
||||
// NOTE(review): the predicate admits every thread with threadIdx.x == 0
// and threadIdx.y a multiple of 32; whether that is exactly one thread
// per warp depends on the launch dimensions - confirm.
if ( threadIdx.x == 0 &&
|
||||
0 == ( threadIdx.y % 32 ) ) {
|
||||
task_root_type * task = 0 ;
|
||||
for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) {
|
||||
if ( ( i < 2 /* regular queue */ )
|
||||
|| ( ! m_space.is_empty() /* waiting for memory */ ) ) {
|
||||
task = pop_ready_task( & m_serial[i] );
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DETAILED_PRINT
|
||||
if ( 0 != task ) {
|
||||
printf( "CudaTaskPolicyQueue::driver() (%2d)(%d) single task(0x%lx)\n"
|
||||
, blockIdx.x
|
||||
, threadIdx.y
|
||||
, (unsigned long) task );
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( task ) {
|
||||
(*task->m_serial)( task );
|
||||
complete_executed_task( task );
|
||||
}
|
||||
}
|
||||
|
||||
// Barrier: the block rejoins after the serial-task branch.
__syncthreads();
|
||||
}
|
||||
}
|
||||
} while ( q_denied != team_task );
|
||||
|
||||
#ifdef DETAILED_PRINT
|
||||
if ( IS_TEAM_LEAD ) {
|
||||
printf( "CudaTaskPolicyQueue::driver() end on %d with count %d\n"
|
||||
, blockIdx.x , m_count_ready );
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef IS_TEAM_LEAD
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
/** \brief  Attempt to pop the task at the head of a ready queue.
 *
 *  The head pointer is swapped for the QLOCK sentinel while the claimed
 *  task is moved from the waiting to the executing state; the head is then
 *  replaced with the claimed task's successor, which releases the lock.
 *
 *  \param queue  Address of the head pointer of the queue to pop from.
 *  \return  The claimed task, or 0 when the queue was empty, was locked,
 *           or the compare-exchange on the head did not succeed.
 *
 *  Aborts if the claimed task was not in the waiting state or if the
 *  queue was no longer holding the lock sentinel at release time.
 */
__device__
CudaTaskPolicyQueue::task_root_type *
CudaTaskPolicyQueue::pop_ready_task(
  CudaTaskPolicyQueue::task_root_type * volatile * const queue )
{
  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
  task_root_type * task = 0 ;
  task_root_type * const task_claim = *queue ;

  if ( ( q_lock != task_claim ) && ( 0 != task_claim ) ) {

    // Queue is not locked and not null, try to claim head of queue.
    // Is a race among threads to claim the queue.

    if ( task_claim == atomic_compare_exchange(queue,task_claim,q_lock) ) {

      // Acquired the task which must be in the waiting state.

      const int claim_state =
        atomic_compare_exchange( & task_claim->m_state
                               , int(TASK_STATE_WAITING)
                               , int(TASK_STATE_EXECUTING) );

      task_root_type * lock_verify = 0 ;

      if ( claim_state == int(TASK_STATE_WAITING) ) {

        // Transitioned this task from waiting to executing
        // Update the queue to the next entry and release the lock

        task_root_type * const next =
          *((task_root_type * volatile *) & task_claim->m_next );

        *((task_root_type * volatile *) & task_claim->m_next ) = 0 ;

        lock_verify = atomic_compare_exchange( queue , q_lock , next );
      }

      if ( ( claim_state != int(TASK_STATE_WAITING) ) ||
           ( q_lock != lock_verify ) ) {

        // Report the claimed task: 'task' is still null at this point,
        // so printing it would hide which task triggered the error.
        printf( "CudaTaskPolicyQueue::pop_ready_task(0x%lx) task(0x%lx) state(%d) ERROR %s\n"
              , (unsigned long) queue
              , (unsigned long) task_claim
              , claim_state
              , ( claim_state != int(TASK_STATE_WAITING)
                  ? "NOT WAITING"
                  : "UNLOCK" ) );
        Kokkos::abort("CudaTaskPolicyQueue::pop_ready_task");
      }

      task = task_claim ;
    }
  }
  return task ;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
/** \brief  Finish processing of a task whose functor has just been applied.
 *
 *  If the task is still in the executing state it is marked complete, its
 *  dependences are cleared, its wait queue is closed with the QDENIED
 *  sentinel, one reference is released, and every task that was waiting on
 *  it is scheduled.  If the task was found in the waiting state (a respawn
 *  was requested) it is scheduled again.  Any other state aborts.
 *
 *  The ready count is decremented as the very last step.
 *
 *  \param task  The task that was executed.
 */
__device__
void CudaTaskPolicyQueue::complete_executed_task(
  CudaTaskPolicyQueue::task_root_type * task )
{
  task_root_type * const closed_mark = reinterpret_cast<task_root_type*>(QDENIED);

#ifdef DETAILED_PRINT
  printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) state(%d) (%d)(%d,%d)\n"
        , (unsigned long) task
        , task->m_state
        , blockIdx.x
        , threadIdx.x
        , threadIdx.y
        );
#endif

  // Try executing -> complete; the returned value is the state that was
  // observed before the attempt.

  const int prior_state =
    atomic_compare_exchange( & task->m_state
                           , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                           , int(Kokkos::Experimental::TASK_STATE_COMPLETE) );

  const bool was_executing =
    int(Kokkos::Experimental::TASK_STATE_EXECUTING) == prior_state ;

  const bool was_waiting =
    int(Kokkos::Experimental::TASK_STATE_WAITING) == prior_state ;

  if ( was_waiting ) {
    // The task asked to be respawned: put it back into the scheduler.
    schedule_task( task , false /* not initial spawn */ );
  }
  else if ( was_executing ) {
    // The task is complete.
    // Release its dependences before closing its wait queue.

    task->clear_dependence();

    // Close the wait queue so that no other task can add itself.
    // Other threads push concurrently, so retry the exchange until the
    // value that was replaced is the value that was expected.

    task_root_type * expected =
      *((task_root_type * volatile *) & task->m_wait );

    for (;;) {
      task_root_type * const observed =
        atomic_compare_exchange( & task->m_wait , expected , closed_mark );
      if ( observed == expected ) break ;
      expected = observed ;
    }

    // Drop the reference taken by the initial spawn.
    // This may delete the task if it was the last reference.

    task_root_type::assign( & task , 0 );

    // Schedule every task that was waiting on the completed task.

    task_root_type * waiting = expected ;

    while ( waiting ) {
      task_root_type * const released = waiting ;
      waiting = released->m_next ;
      released->m_next = 0 ;
      schedule_task( released , false /* not initial spawn */ );
    }
  }
  else {
    printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) ERROR state_old(%d) dep_size(%d)\n"
          , (unsigned long)( task )
          , int(prior_state)
          , task->m_dep_size
          );
    Kokkos::abort("CudaTaskPolicyQueue::complete_executed_task" );
  }

  // A respawned task may already be back in a ready queue with the count
  // incremented.  Decrementing last keeps the count from reaching zero
  // while a task is ready or executing.

  atomic_fetch_add( & m_count_ready , -1 );
}
|
||||
|
||||
/** \brief  Count down this latch by 'k'.
 *
 *  'k' is subtracted from m_dep_size.  When the result reaches zero the
 *  latch is moved from the waiting to the complete state, its wait queue is
 *  closed with the QDENIED sentinel, and every task that was waiting on the
 *  latch is scheduled through the latch's policy.
 *
 *  \param k  Amount to count down; must be strictly positive.
 *
 *  Aborts when 'k' is not positive, when the resulting count is negative,
 *  or when the latch was not in the waiting state at completion time.
 */
__device__
void TaskMember< Kokkos::Cuda , void , void >::latch_add( const int k )
{
  typedef TaskMember< Kokkos::Cuda , void , void > task_root_type ;

  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

  const bool ok_input = 0 < k ;

  // Only touch the counter for valid input.
  const int count = ok_input ? atomic_fetch_add( & m_dep_size , -k ) - k
                             : k ;

  const bool ok_count = 0 <= count ;

  // Only a valid call that brings the count to zero may complete the latch.
  // In particular k == 0 must not reach the state transition: it would
  // yield count == 0 and silently release the latch.
  const int state = ( ! ok_input || 0 != count ) ? TASK_STATE_WAITING :
    atomic_compare_exchange( & m_state
                           , TASK_STATE_WAITING
                           , TASK_STATE_COMPLETE );

  const bool ok_state = state == TASK_STATE_WAITING ;

  if ( ! ok_input || ! ok_count || ! ok_state ) {
    printf( "CudaTaskPolicyQueue::latch_add[0x%lx](%d) ERROR %s %d\n"
          , (unsigned long) this
          , k
          , ( ! ok_input ? "Non-positive input" :
            ( ! ok_count ? "Negative count" : "Bad State" ) )
          , ( ! ok_input ? k :
            ( ! ok_count ? count : state ) )
          );
    Kokkos::abort( "CudaTaskPolicyQueue::latch_add ERROR" );
  }
  else if ( 0 == count ) {
    // Stop other tasks from adding themselves to this latch's wait queue.
    // The wait queue is updated concurrently so guard with an atomic.

    CudaTaskPolicyQueue & policy = *m_policy ;
    task_root_type * wait_queue = *((task_root_type * volatile *) &m_wait);
    task_root_type * wait_queue_old = 0 ;

    do {
      wait_queue_old = wait_queue ;
      wait_queue = atomic_compare_exchange( & m_wait , wait_queue_old , q_denied );
    } while ( wait_queue_old != wait_queue );

    // Pop waiting tasks and schedule them
    while ( wait_queue ) {
      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
      policy.schedule_task( x , false /* not initial spawn */ );
    }
  }
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief  Request a respawn of an executing task.
 *
 *  Moves the task from the executing state back to the waiting state.
 *  Aborts if the task was not in the executing state.
 *
 *  \param task  The task to move back to the waiting state.
 */
void CudaTaskPolicyQueue::reschedule_task(
  CudaTaskPolicyQueue::task_root_type * const task )
{
  const int state_before =
    atomic_compare_exchange( & task->m_state
                           , int(TASK_STATE_EXECUTING)
                           , int(TASK_STATE_WAITING) );

  // Nothing more to do when the transition succeeded.
  if ( state_before == int(TASK_STATE_EXECUTING) ) return ;

  printf( "CudaTaskPolicyQueue::reschedule_task(0x%lx) ERROR state(%d)\n"
        , (unsigned long) task
        , state_before
        );
  Kokkos::abort("CudaTaskPolicyQueue::reschedule" );
}
|
||||
|
||||
/** \brief  Schedule a task: park it on an incomplete dependence or make it ready.
 *
 *  Steps, in order:
 *    1. Move the task from constructing to waiting (it may already be
 *       waiting) and verify it is neither complete nor linked in a queue.
 *    2. On the initial spawn, take a reference on the task.
 *    3. Walk the task's dependences and push the task onto the wait queue
 *       of the first one whose wait queue is not closed (QDENIED).
 *    4. If every dependence wait queue is closed, increment the ready count
 *       and push the task onto its ready queue (task->m_queue), retrying
 *       while that queue holds the QLOCK sentinel.
 *
 *  \param task           The task to schedule.
 *  \param initial_spawn  True for the first scheduling of the task.
 */
KOKKOS_FUNCTION
void CudaTaskPolicyQueue::schedule_task(
  CudaTaskPolicyQueue::task_root_type * const task ,
  const bool initial_spawn )
{
  task_root_type * const q_lock   = reinterpret_cast<task_root_type*>(QLOCK);
  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

  //----------------------------------------
  // Step 1: constructing -> waiting, then sanity check the task.

  {
    const int state_before =
      atomic_compare_exchange( & task->m_state
                             , int(TASK_STATE_CONSTRUCTING)
                             , int(TASK_STATE_WAITING) );

    // Head of the list of tasks waiting on this task.
    task_root_type * const waiters =
      *((task_root_type * volatile const *) & task->m_wait );

    // Link used when this task is a member of some other queue.
    task_root_type * const link =
      *((task_root_type * volatile const *) & task->m_next );

    // An incomplete and non-executing task was constructing or waiting,
    // has an open wait queue, and is not linked into any queue.

    const bool state_ok =
      state_before == int(TASK_STATE_CONSTRUCTING) ||
      state_before == int(TASK_STATE_WAITING) ;

    if ( ( q_denied == waiters ) || ( 0 != link ) || ! state_ok ) {
      printf( "CudaTaskPolicyQueue::schedule_task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
            , (unsigned long) task
            , state_before
            , (unsigned long) waiters
            , (unsigned long) link );
      Kokkos::abort("CudaTaskPolicyQueue::schedule" );
    }
  }

  //----------------------------------------
  // Step 2: the initial spawn holds a reference until the task completes;
  // completing the task's execution is the matching decrement.

  if ( initial_spawn ) {
    task_root_type::assign( 0 , task );
  }

  //----------------------------------------
  // Step 3: try to park the task on a dependence that is not complete.

  bool pending = true ;

  task_root_type * volatile * queue =
    task->m_dep_size ? & task->m_dep[0]->m_wait : (task_root_type **) 0 ;

  int dep_index = 0 ;

  while ( pending && ( 0 != queue ) ) {

    task_root_type * const head = *queue ;

    if ( q_denied == head ) {
      // This dependence is complete and its wait queue is closed;
      // move on to the next dependence, if any.
      ++dep_index ;
      queue = dep_index < task->m_dep_size ? & task->m_dep[dep_index]->m_wait
                                           : (task_root_type **) 0 ;
      continue ;
    }

    // The wait queue is open.  This thread has exclusive access to the
    // task, so link it in front of the current head, fence the store,
    // and then attempt the insertion.  A failed exchange retries.

    *((task_root_type * volatile *) & task->m_next ) = head ;

    memory_fence();

    pending = head != atomic_compare_exchange(queue,head,task);
  }

  //----------------------------------------
  // Step 4: all dependences are complete, insert into the ready queue.

  if ( pending ) {

    // Count the task as ready; the count is decremented on completion.

    atomic_fetch_add( & m_count_ready , 1 );

    queue = task->m_queue ;

    while ( pending ) {

      task_root_type * const head = *queue ;

      // A locked queue is being popped; read the head again.
      if ( q_lock == head ) continue ;

      // Link in front of the current head, fence the store,
      // and attempt the insertion.

      *((task_root_type * volatile *) & task->m_next ) = head ;

      memory_fence();

      pending = head != atomic_compare_exchange(queue,head,task);
    }
  }
}
|
||||
|
||||
void CudaTaskPolicyQueue::deallocate_task
|
||||
( CudaTaskPolicyQueue::task_root_type * const task )
|
||||
{
|
||||
m_space.deallocate( task , task->m_size_alloc );
|
||||
}
|
||||
|
||||
/** \brief  Allocate and default-initialize storage for one task.
 *
 *  The allocation holds the task object, padded up to a multiple of the
 *  pointer size, followed by an array of 'dep_capacity' dependence
 *  pointers that is zero-filled.
 *
 *  \param arg_sizeof_task   Size in bytes of the derived task type.
 *  \param arg_dep_capacity  Dependence capacity; ~0u selects the queue's
 *                           default dependence capacity.
 *  \param arg_team_shmem    Value stored in the task's m_shmem_size.
 *  \return  The new task, or 0 when m_space.allocate returned 0.
 *           The calling function must copy construct the functor.
 */
KOKKOS_FUNCTION
CudaTaskPolicyQueue::task_root_type *
CudaTaskPolicyQueue::allocate_task
  ( const unsigned arg_sizeof_task
  , const unsigned arg_dep_capacity
  , const unsigned arg_team_shmem
  )
{
  // Pad the task size up to the next multiple of the pointer size so the
  // dependence array that follows it starts on a pointer-sized boundary.

  const unsigned ptr_bytes = sizeof(task_root_type*) ;
  const unsigned excess    = arg_sizeof_task % ptr_bytes ;
  const unsigned base_size =
    excess ? arg_sizeof_task + ( ptr_bytes - excess ) : arg_sizeof_task ;

  unsigned dep_capacity = arg_dep_capacity ;

  if ( ~0u == arg_dep_capacity ) {
    dep_capacity = m_default_dependence_capacity ;
  }

  const unsigned size_alloc = base_size + ptr_bytes * dep_capacity ;

  task_root_type * const task =
    reinterpret_cast<task_root_type*>( m_space.allocate( size_alloc ) );

  if ( 0 == task ) return task ;

  // Construct the root of the task in place.

  new( (void*) task ) task_root_type();

  task->m_policy       = this ;
  task->m_size_alloc   = size_alloc ;
  task->m_dep_capacity = dep_capacity ;
  task->m_shmem_size   = arg_team_shmem ;

  if ( dep_capacity ) {
    // The dependence array lives immediately after the padded task.
    unsigned char * const raw = reinterpret_cast<unsigned char*>(task);

    task->m_dep = reinterpret_cast<task_root_type**>( raw + base_size );

    for ( unsigned slot = 0 ; slot < dep_capacity ; ++slot ) {
      task->task_root_type::m_dep[slot] = 0 ;
    }
  }

  return task ;
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void CudaTaskPolicyQueue::add_dependence
|
||||
( CudaTaskPolicyQueue::task_root_type * const after
|
||||
, CudaTaskPolicyQueue::task_root_type * const before
|
||||
)
|
||||
{
|
||||
if ( ( after != 0 ) && ( before != 0 ) ) {
|
||||
|
||||
int const state = *((volatile const int *) & after->m_state );
|
||||
|
||||
// Only add dependence during construction or during execution.
|
||||
// Both tasks must have the same policy.
|
||||
// Dependence on non-full memory cannot be mixed with any other dependence.
|
||||
|
||||
const bool ok_state =
|
||||
Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state ||
|
||||
Kokkos::Experimental::TASK_STATE_EXECUTING == state ;
|
||||
|
||||
const bool ok_capacity =
|
||||
after->m_dep_size < after->m_dep_capacity ;
|
||||
|
||||
const bool ok_policy =
|
||||
after->m_policy == this && before->m_policy == this ;
|
||||
|
||||
if ( ok_state && ok_capacity && ok_policy ) {
|
||||
|
||||
++after->m_dep_size ;
|
||||
|
||||
task_root_type::assign( after->m_dep + (after->m_dep_size-1) , before );
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
else {
|
||||
|
||||
printf( "CudaTaskPolicyQueue::add_dependence( 0x%lx , 0x%lx ) ERROR %s\n"
|
||||
, (unsigned long) after
|
||||
, (unsigned long) before
|
||||
, ( ! ok_state ? "Task not constructing or executing" :
|
||||
( ! ok_capacity ? "Task Exceeded dependence capacity"
|
||||
: "Tasks from different policies" )) );
|
||||
|
||||
Kokkos::abort("CudaTaskPolicyQueue::add_dependence ERROR");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/** \brief  Construct a CUDA task policy and its queue.
 *
 *  The queue object is placed in a CudaUVMSpace shared allocation record
 *  whose Destroy functor is given the queue pointer; the record is then
 *  handed to m_track.
 *
 *  \param arg_task_max_count                    Forwarded to the queue.
 *  \param arg_task_max_size                     Application task size; the
 *         queue receives this plus sizeof(task_root_type) plus room for
 *         the default number of dependence pointers.
 *  \param arg_task_default_dependence_capacity  Forwarded to the queue.
 *  \param arg_task_team_size                    Forwarded to the queue.
 */
TaskPolicy< Kokkos::Cuda >::TaskPolicy
  ( const unsigned arg_task_max_count
  , const unsigned arg_task_max_size
  , const unsigned arg_task_default_dependence_capacity
  , const unsigned arg_task_team_size
  )
  : m_track()
  , m_policy(0)
{
  typedef Impl::CudaTaskPolicyQueue  queue_impl_type ;

  typedef Kokkos::Experimental::Impl::SharedAllocationRecord
    < Kokkos::CudaUVMSpace , queue_impl_type::Destroy >  record_type ;

  // Tasks are allocated with application's task size + sizeof(task_root_type)
  // plus the default-capacity array of dependence pointers.

  const size_t full_task_size_estimate =
    arg_task_max_size +
    sizeof(task_root_type) +
    sizeof(task_root_type*) * arg_task_default_dependence_capacity ;

  // Allocate the queue data structure in UVM space.

  record_type * const alloc_record =
    record_type::allocate( Kokkos::CudaUVMSpace()
                         , "CudaUVM task queue"
                         , sizeof(queue_impl_type)
                         );

  m_policy = reinterpret_cast< queue_impl_type * >( alloc_record->data() );

  // Construct the queue in the storage that was just allocated.

  new( m_policy )
    queue_impl_type( arg_task_max_count
                   , full_task_size_estimate
                   , arg_task_default_dependence_capacity
                   , arg_task_team_size );

  alloc_record->m_destroy.m_policy = m_policy ;

  m_track.assign_allocated_record_to_uninitialized( alloc_record );
}
|
||||
|
||||
/** \brief  Kernel entry point: every thread calls the queue's driver().
 *
 *  \param queue  The task policy queue to drive.
 */
__global__
static void kokkos_cuda_task_policy_queue_driver
  ( Kokkos::Experimental::Impl::CudaTaskPolicyQueue * queue )
{
  Kokkos::Experimental::Impl::CudaTaskPolicyQueue & driven_queue = *queue ;

  driven_queue.driver();
}
|
||||
|
||||
/** \brief  Run the task queue driver kernel and block until it finishes.
 *
 *  Synchronizes the device, launches kokkos_cuda_task_policy_queue_driver
 *  on the policy's queue, checks the launch for errors, and synchronizes
 *  the device again.
 *
 *  \param policy  Task policy whose queue is driven.
 */
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Cuda > & policy )
{
  // grid.x is the value of cuda_internal_multiprocessor_count();
  // each block is 1 x team_size x 1 threads.
  const dim3 launch_grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
  const dim3 launch_block( 1 , policy.m_policy->m_team_size , 1 );

  // No dynamic shared memory is requested
  // (alternative left disabled: Kokkos::Impl::CudaTraits::SharedMemoryUsage / 2).
  const int dynamic_shmem = 0 ;

  const cudaStream_t launch_stream = 0 ;

#ifdef DETAILED_PRINT
  printf("kokkos_cuda_task_policy_queue_driver grid(%d,%d,%d) block(%d,%d,%d) shared(%d) policy(0x%lx)\n"
        , launch_grid.x , launch_grid.y , launch_grid.z
        , launch_block.x , launch_block.y , launch_block.z
        , dynamic_shmem
        , (unsigned long)( policy.m_policy ) );
  fflush(stdout);
#endif

  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

  // Disabled: prefer-L1 cache configuration for the driver kernel.
  //
  //   CUDA_SAFE_CALL(
  //     cudaFuncSetCacheConfig( kokkos_cuda_task_policy_queue_driver
  //                           , cudaFuncCachePreferL1 ) );
  //
  //   CUDA_SAFE_CALL( cudaGetLastError() );

  kokkos_cuda_task_policy_queue_driver
    <<< launch_grid , launch_block , dynamic_shmem , launch_stream >>>
      ( policy.m_policy );

  CUDA_SAFE_CALL( cudaGetLastError() );

  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

#ifdef DETAILED_PRINT
  printf("kokkos_cuda_task_policy_queue_driver end\n");
  fflush(stdout);
#endif
}
|
||||
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
typedef TaskMember< Kokkos::Cuda , void , void > Task ;
|
||||
|
||||
/** \brief  Destructor of the task root; intentionally empty. */
__host__ __device__
Task::~TaskMember() {}
|
||||
|
||||
/** \brief  Reference-counted pointer assignment: *lhs_ptr = rhs.
 *
 *  The reference count of 'rhs' (if non-null) is incremented.  When
 *  'lhs_ptr' is null nothing else happens.  Otherwise the previous value of
 *  *lhs_ptr is replaced by 'rhs' and, if it was non-null, its reference
 *  count is decremented; when that count reaches zero the task is returned
 *  to its policy via deallocate_task.
 *
 *  \param lhs_ptr  Address of the pointer to overwrite, or 0 to only take
 *                  a reference on 'rhs'.  The caller must have exclusive
 *                  access to *lhs_ptr.
 *  \param rhs      New value, may be 0.
 *
 *  Aborts when the old and new tasks belong to different policies, when
 *  the reference count becomes negative, or when a task about to be
 *  deleted is not in a deletable state.
 */
__host__ __device__
void Task::assign( Task ** const lhs_ptr , Task * rhs )
{
  Task * const q_denied = reinterpret_cast<Task*>(QDENIED);

  // Increment rhs reference count.
  if ( rhs ) { atomic_fetch_add( & rhs->m_ref_count , 1 ); }

  if ( 0 == lhs_ptr ) return ;

  // Must have exclusive access to *lhs_ptr.
  // Assign the pointer and retrieve the previous value.
  // Cannot use atomic exchange since *lhs_ptr may be
  // in Cuda register space.

#if 0

  Task * const old_lhs = *((Task*volatile*)lhs_ptr);

  *((Task*volatile*)lhs_ptr) = rhs ;

  Kokkos::memory_fence();

#else

  Task * const old_lhs = *lhs_ptr ;

  *lhs_ptr = rhs ;

#endif

  if ( old_lhs && rhs && old_lhs->m_policy != rhs->m_policy ) {
    Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR different queues");
  }

  if ( old_lhs ) {

    Kokkos::memory_fence();

    // Decrement former lhs reference count.
    // If reference count is zero task must be complete, then delete task.
    // Task is ready for deletion when wait == q_denied

    int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
    int const state = old_lhs->m_state ;
    Task * const wait = *((Task * const volatile *) & old_lhs->m_wait );

    const bool ok_count = 0 <= count ;

    // If count == 0 then will be deleting
    // and must either be constructing or complete,
    // must not be linked into a queue, and must hold no dependences.
    const bool ok_state = 0 < count ? true :
      ( ( ( state == int(TASK_STATE_CONSTRUCTING) && wait == 0 ) ||
          ( state == int(TASK_STATE_COMPLETE)     && wait == q_denied ) )
        &&
        old_lhs->m_next == 0 &&
        old_lhs->m_dep_size == 0 ) ;

    if ( ! ok_count || ! ok_state ) {

      // m_wait is a pointer value: print it in hexadecimal to match the
      // "0x" prefix (the previous "%ld" printed decimal digits after "0x").
      printf( "%s Kokkos::Impl::TaskManager<Kokkos::Cuda>::assign ERROR deleting task(0x%lx) m_ref_count(%d) m_state(%d) m_wait(0x%lx)\n"
#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_CUDA )
            , "CUDA "
#else
            , "HOST "
#endif
            , (unsigned long) old_lhs
            , count
            , state
            , (unsigned long) wait );
      Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR deleting");
    }

    if ( count == 0 ) {
      // When 'count == 0' this thread has exclusive access to 'old_lhs'

#ifdef DETAILED_PRINT
      printf( "Task::assign(...) old_lhs(0x%lx) deallocate\n"
            , (unsigned long) old_lhs
            );
#endif

      old_lhs->m_policy->deallocate_task( old_lhs );
    }
  }
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief  Number of dependences currently recorded on this task. */
__device__
int Task::get_dependence() const
{
  return m_dep_size ;
}

/** \brief  Fetch the i-th dependence of an executing task.
 *
 *  \param i  Index of the dependence, 0 <= i < get_dependence().
 *  \return   The dependence task.
 *
 *  Aborts when this task is not executing, when 'i' is out of range,
 *  or when the stored dependence is null.
 */
__device__
Task * Task::get_dependence( int i ) const
{
  // Validate the index before touching the array so that an invalid 'i'
  // reaches the error report instead of reading out of bounds.
  const bool ok_index = ( 0 <= i ) && ( i < m_dep_size ) ;

  Task * const t = ok_index ? ((Task*volatile*)m_dep)[i] : (Task *) 0 ;

  if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || ! ok_index || 0 == t ) {

    printf( "TaskMember< Cuda >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
          , (unsigned long) this
          , m_state
          , m_dep_size
          , i
          , (unsigned long) t
          );

    Kokkos::abort("TaskMember< Cuda >::get_dependence ERROR");
  }

  return t ;
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief  Release every dependence held by this task.
 *
 *  Each dependence slot is reset to null through assign(), from the last
 *  slot down to the first; the dependence count is then stored as zero
 *  and a memory fence is issued.
 */
__device__ __host__
void Task::clear_dependence()
{
  int slot = m_dep_size ;

  while ( 0 < slot ) {
    --slot ;
    assign( m_dep + slot , 0 );
  }

  *((volatile int *) & m_dep_size ) = 0 ;

  memory_fence();
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
@ -1,833 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD
|
||||
|
||||
#ifndef KOKKOS_CUDA_TASKPOLICY_HPP
|
||||
#define KOKKOS_CUDA_TASKPOLICY_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
struct CudaTaskPolicyQueue ;
|
||||
|
||||
/** \brief Base class for all Kokkos::Cuda tasks */
|
||||
template<>
|
||||
class TaskMember< Kokkos::Cuda , void , void > {
|
||||
public:
|
||||
|
||||
template< class > friend class Kokkos::Experimental::TaskPolicy ;
|
||||
friend struct CudaTaskPolicyQueue ;
|
||||
|
||||
typedef void (* function_single_type) ( TaskMember * );
|
||||
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::CudaTeamMember & );
|
||||
|
||||
private:
|
||||
|
||||
CudaTaskPolicyQueue * m_policy ;
|
||||
TaskMember * volatile * m_queue ;
|
||||
function_team_type m_team ; ///< Apply function on CUDA
|
||||
function_single_type m_serial ; ///< Apply function on CUDA
|
||||
TaskMember ** m_dep ; ///< Dependences
|
||||
TaskMember * m_wait ; ///< Linked list of tasks waiting on this task
|
||||
TaskMember * m_next ; ///< Linked list of tasks waiting on a different task
|
||||
int m_dep_capacity ; ///< Capacity of dependences
|
||||
int m_dep_size ; ///< Actual count of dependences
|
||||
int m_size_alloc ;
|
||||
int m_shmem_size ;
|
||||
int m_ref_count ; ///< Reference count
|
||||
int m_state ; ///< State of the task
|
||||
|
||||
|
||||
TaskMember( TaskMember && ) = delete ;
|
||||
TaskMember( const TaskMember & ) = delete ;
|
||||
TaskMember & operator = ( TaskMember && ) = delete ;
|
||||
TaskMember & operator = ( const TaskMember & ) = delete ;
|
||||
|
||||
protected:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskMember()
|
||||
: m_policy(0)
|
||||
, m_queue(0)
|
||||
, m_team(0)
|
||||
, m_serial(0)
|
||||
, m_dep(0)
|
||||
, m_wait(0)
|
||||
, m_next(0)
|
||||
, m_size_alloc(0)
|
||||
, m_dep_capacity(0)
|
||||
, m_dep_size(0)
|
||||
, m_shmem_size(0)
|
||||
, m_ref_count(0)
|
||||
, m_state( TASK_STATE_CONSTRUCTING )
|
||||
{}
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
~TaskMember();
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int reference_count() const
|
||||
{ return *((volatile int *) & m_ref_count ); }
|
||||
|
||||
// Cannot use the function pointer to verify the type
|
||||
// since the function pointer is not unique between
|
||||
// Host and Cuda. Don't run verificaton for Cuda.
|
||||
// Assume testing on Host-only back-end will catch such errors.
|
||||
|
||||
template< typename ResultType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
TaskMember * verify_type( TaskMember * t ) { return t ; }
|
||||
|
||||
//----------------------------------------
|
||||
/* Inheritence Requirements on task types:
|
||||
*
|
||||
* class DerivedTaskType
|
||||
* : public TaskMember< Cuda , DerivedType::value_type , FunctorType >
|
||||
* { ... };
|
||||
*
|
||||
* class TaskMember< Cuda , DerivedType::value_type , FunctorType >
|
||||
* : public TaskMember< Cuda , DerivedType::value_type , void >
|
||||
* , public Functor
|
||||
* { ... };
|
||||
*
|
||||
* If value_type != void
|
||||
* class TaskMember< Cuda , value_type , void >
|
||||
* : public TaskMember< Cuda , void , void >
|
||||
*
|
||||
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
|
||||
*
|
||||
*/
|
||||
//----------------------------------------
|
||||
// If after the 'apply' the task's state is waiting
|
||||
// then it will be rescheduled and called again.
|
||||
// Otherwise the functor must be destroyed.
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__ static
|
||||
void apply_single(
|
||||
typename std::enable_if
|
||||
<( std::is_same< Tag , void >::value &&
|
||||
std::is_same< typename DerivedTaskType::result_type , void >::value
|
||||
), TaskMember * >::type t )
|
||||
{
|
||||
typedef typename DerivedTaskType::functor_type functor_type ;
|
||||
|
||||
functor_type * const f =
|
||||
static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
|
||||
|
||||
f->apply();
|
||||
|
||||
if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
|
||||
f->~functor_type();
|
||||
}
|
||||
}
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__ static
|
||||
void apply_single(
|
||||
typename std::enable_if
|
||||
<( std::is_same< Tag , void >::value &&
|
||||
! std::is_same< typename DerivedTaskType::result_type , void >::value
|
||||
), TaskMember * >::type t )
|
||||
{
|
||||
typedef typename DerivedTaskType::functor_type functor_type ;
|
||||
|
||||
DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
|
||||
functor_type * const f = static_cast< functor_type * >( self );
|
||||
|
||||
f->apply( self->m_result );
|
||||
|
||||
if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
|
||||
f->~functor_type();
|
||||
}
|
||||
}
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__
|
||||
void set_apply_single()
|
||||
{
|
||||
m_serial = & TaskMember::template apply_single<DerivedTaskType,Tag> ;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__ static
|
||||
void apply_team(
|
||||
typename std::enable_if
|
||||
<( std::is_same<Tag,void>::value &&
|
||||
std::is_same<typename DerivedTaskType::result_type,void>::value
|
||||
), TaskMember * >::type t
|
||||
, Kokkos::Impl::CudaTeamMember & member
|
||||
)
|
||||
{
|
||||
typedef typename DerivedTaskType::functor_type functor_type ;
|
||||
|
||||
functor_type * const f =
|
||||
static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
|
||||
|
||||
f->apply( member );
|
||||
|
||||
__syncthreads(); // Wait for team to finish calling function
|
||||
|
||||
if ( threadIdx.x == 0 &&
|
||||
threadIdx.y == 0 &&
|
||||
t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
|
||||
f->~functor_type();
|
||||
}
|
||||
}
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__ static
|
||||
void apply_team(
|
||||
typename std::enable_if
|
||||
<( std::is_same<Tag,void>::value &&
|
||||
! std::is_same<typename DerivedTaskType::result_type,void>::value
|
||||
), TaskMember * >::type t
|
||||
, Kokkos::Impl::CudaTeamMember & member
|
||||
)
|
||||
{
|
||||
typedef typename DerivedTaskType::functor_type functor_type ;
|
||||
|
||||
DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
|
||||
functor_type * const f = static_cast< functor_type * >( self );
|
||||
|
||||
f->apply( member , self->m_result );
|
||||
|
||||
__syncthreads(); // Wait for team to finish calling function
|
||||
|
||||
if ( threadIdx.x == 0 &&
|
||||
threadIdx.y == 0 &&
|
||||
t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
|
||||
f->~functor_type();
|
||||
}
|
||||
}
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__device__
|
||||
void set_apply_team()
|
||||
{
|
||||
m_team = & TaskMember::template apply_team<DerivedTaskType,Tag> ;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_FUNCTION static
|
||||
void assign( TaskMember ** const lhs , TaskMember * const rhs );
|
||||
|
||||
__device__
|
||||
TaskMember * get_dependence( int i ) const ;
|
||||
|
||||
__device__
|
||||
int get_dependence() const ;
|
||||
|
||||
KOKKOS_FUNCTION void clear_dependence();
|
||||
|
||||
__device__
|
||||
void latch_add( const int k );
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// The root task type carries no result value, so there is nothing to construct.
KOKKOS_INLINE_FUNCTION static
void construct_result( TaskMember * const )
{
}
|
||||
|
||||
typedef FutureValueTypeIsVoidError get_result_type ;
|
||||
|
||||
// Returns a default-constructed get_result_type (FutureValueTypeIsVoidError
// for this root type, which stores no result).
KOKKOS_INLINE_FUNCTION
get_result_type get() const
{
  return get_result_type() ;
}
|
||||
|
||||
// Current task state: the integer m_state converted to the TaskState enum.
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const
{
  return Kokkos::Experimental::TaskState( m_state );
}
|
||||
|
||||
};
|
||||
|
||||
/** \brief A Future< Kokkos::Cuda , ResultType > will cast
|
||||
* from TaskMember< Kokkos::Cuda , void , void >
|
||||
* to TaskMember< Kokkos::Cuda , ResultType , void >
|
||||
* to query the result.
|
||||
*/
|
||||
template< class ResultType >
|
||||
class TaskMember< Kokkos::Cuda , ResultType , void >
|
||||
: public TaskMember< Kokkos::Cuda , void , void >
|
||||
{
|
||||
public:
|
||||
|
||||
typedef ResultType result_type ;
|
||||
|
||||
result_type m_result ;
|
||||
|
||||
typedef const result_type & get_result_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
get_result_type get() const { return m_result ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void construct_result( TaskMember * const ptr )
|
||||
{
|
||||
new((void*)(& ptr->m_result)) result_type();
|
||||
}
|
||||
|
||||
TaskMember() = delete ;
|
||||
TaskMember( TaskMember && ) = delete ;
|
||||
TaskMember( const TaskMember & ) = delete ;
|
||||
TaskMember & operator = ( TaskMember && ) = delete ;
|
||||
TaskMember & operator = ( const TaskMember & ) = delete ;
|
||||
};
|
||||
|
||||
/** \brief Callback functions will cast
|
||||
* from TaskMember< Kokkos::Cuda , void , void >
|
||||
* to TaskMember< Kokkos::Cuda , ResultType , FunctorType >
|
||||
* to execute work functions.
|
||||
*/
|
||||
template< class ResultType , class FunctorType >
|
||||
class TaskMember< Kokkos::Cuda , ResultType , FunctorType >
|
||||
: public TaskMember< Kokkos::Cuda , ResultType , void >
|
||||
, public FunctorType
|
||||
{
|
||||
public:
|
||||
typedef ResultType result_type ;
|
||||
typedef FunctorType functor_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void copy_construct( TaskMember * const ptr
|
||||
, const functor_type & arg_functor )
|
||||
{
|
||||
typedef TaskMember< Kokkos::Cuda , ResultType , void > base_type ;
|
||||
|
||||
new((void*)static_cast<FunctorType*>(ptr)) functor_type( arg_functor );
|
||||
|
||||
base_type::construct_result( static_cast<base_type*>( ptr ) );
|
||||
}
|
||||
|
||||
TaskMember() = delete ;
|
||||
TaskMember( TaskMember && ) = delete ;
|
||||
TaskMember( const TaskMember & ) = delete ;
|
||||
TaskMember & operator = ( TaskMember && ) = delete ;
|
||||
TaskMember & operator = ( const TaskMember & ) = delete ;
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace {
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__global__
|
||||
void cuda_set_apply_single( DerivedTaskType * task )
|
||||
{
|
||||
typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
|
||||
task_root_type ;
|
||||
|
||||
task->task_root_type::template set_apply_single< DerivedTaskType , Tag >();
|
||||
}
|
||||
|
||||
template< class DerivedTaskType , class Tag >
|
||||
__global__
|
||||
void cuda_set_apply_team( DerivedTaskType * task )
|
||||
{
|
||||
typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
|
||||
task_root_type ;
|
||||
|
||||
task->task_root_type::template set_apply_team< DerivedTaskType , Tag >();
|
||||
}
|
||||
|
||||
} /* namespace */
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
struct CudaTaskPolicyQueue {
|
||||
|
||||
enum { NPRIORITY = 3 };
|
||||
|
||||
// Must use UVM so that tasks can be created in both
|
||||
// Host and Cuda space.
|
||||
|
||||
typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace >
|
||||
memory_space ;
|
||||
|
||||
typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
|
||||
task_root_type ;
|
||||
|
||||
memory_space m_space ;
|
||||
task_root_type * m_team[ NPRIORITY ] ;
|
||||
task_root_type * m_serial[ NPRIORITY ];
|
||||
int m_team_size ;
|
||||
int m_default_dependence_capacity ;
|
||||
int volatile m_count_ready ; ///< Ready plus executing tasks
|
||||
|
||||
// Execute tasks until all non-waiting tasks are complete
|
||||
__device__
|
||||
void driver();
|
||||
|
||||
__device__ static
|
||||
task_root_type * pop_ready_task( task_root_type * volatile * const queue );
|
||||
|
||||
// When a task finishes executing.
|
||||
__device__
|
||||
void complete_executed_task( task_root_type * );
|
||||
|
||||
KOKKOS_FUNCTION void schedule_task( task_root_type * const
|
||||
, const bool initial_spawn = true );
|
||||
KOKKOS_FUNCTION void reschedule_task( task_root_type * const );
|
||||
KOKKOS_FUNCTION
|
||||
void add_dependence( task_root_type * const after
|
||||
, task_root_type * const before );
|
||||
|
||||
|
||||
CudaTaskPolicyQueue() = delete ;
|
||||
CudaTaskPolicyQueue( CudaTaskPolicyQueue && ) = delete ;
|
||||
CudaTaskPolicyQueue( const CudaTaskPolicyQueue & ) = delete ;
|
||||
CudaTaskPolicyQueue & operator = ( CudaTaskPolicyQueue && ) = delete ;
|
||||
CudaTaskPolicyQueue & operator = ( const CudaTaskPolicyQueue & ) = delete ;
|
||||
|
||||
|
||||
~CudaTaskPolicyQueue();
|
||||
|
||||
// Construct only on the Host
|
||||
CudaTaskPolicyQueue
|
||||
( const unsigned arg_task_max_count
|
||||
, const unsigned arg_task_max_size
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_task_team_size
|
||||
);
|
||||
|
||||
struct Destroy {
|
||||
CudaTaskPolicyQueue * m_policy ;
|
||||
void destroy_shared_allocation();
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
/** \brief Allocate and construct a task.
|
||||
*
|
||||
* Allocate space for DerivedTaskType followed
|
||||
* by TaskMember*[ dependence_capacity ]
|
||||
*/
|
||||
KOKKOS_FUNCTION
|
||||
task_root_type *
|
||||
allocate_task( const unsigned arg_sizeof_task
|
||||
, const unsigned arg_dep_capacity
|
||||
, const unsigned arg_team_shmem = 0 );
|
||||
|
||||
KOKKOS_FUNCTION void deallocate_task( task_root_type * const );
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
void wait( TaskPolicy< Kokkos::Cuda > & );
|
||||
|
||||
template<>
|
||||
class TaskPolicy< Kokkos::Cuda >
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::Cuda execution_space ;
|
||||
typedef TaskPolicy execution_policy ;
|
||||
typedef Kokkos::Impl::CudaTeamMember member_type ;
|
||||
|
||||
private:
|
||||
|
||||
typedef Impl::TaskMember< Kokkos::Cuda , void , void > task_root_type ;
|
||||
typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace > memory_space ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
Impl::CudaTaskPolicyQueue * m_policy ;
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
const task_root_type * get_task_root( const FunctorType * f )
|
||||
{
|
||||
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
|
||||
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
task_root_type * get_task_root( FunctorType * f )
|
||||
{
|
||||
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
|
||||
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
TaskPolicy
|
||||
( const unsigned arg_task_max_count
|
||||
, const unsigned arg_task_max_size
|
||||
, const unsigned arg_task_default_dependence_capacity = 4
|
||||
, const unsigned arg_task_team_size = 0 /* choose default */
|
||||
);
|
||||
|
||||
KOKKOS_FUNCTION TaskPolicy() = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
|
||||
|
||||
// Always reports zero; no count is tracked by this implementation.
KOKKOS_FUNCTION
int allocated_task_count() const
{
  return 0 ;
}
|
||||
|
||||
//----------------------------------------
|
||||
// Create serial-thread task
|
||||
// Main process and tasks must use different functions
|
||||
// to work around CUDA limitation where __host__ __device__
|
||||
// functions are not allowed to invoke templated __global__ functions.
|
||||
|
||||
template< class FunctorType >
|
||||
Future< typename FunctorType::value_type , execution_space >
|
||||
proc_create( const FunctorType & arg_functor
|
||||
, const unsigned arg_dep_capacity = ~0u ) const
|
||||
{
|
||||
typedef typename FunctorType::value_type value_type ;
|
||||
|
||||
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
|
||||
task_type ;
|
||||
|
||||
task_type * const task =
|
||||
static_cast<task_type*>(
|
||||
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) );
|
||||
|
||||
if ( task ) {
|
||||
// The root part of the class has been constructed.
|
||||
// Must now construct the functor and result specific part.
|
||||
|
||||
task_type::copy_construct( task , arg_functor );
|
||||
|
||||
// Setting the apply pointer on the device requires code
|
||||
// executing on the GPU. This function is called on the
|
||||
// host process so a kernel must be run.
|
||||
|
||||
// Launching a kernel will cause the allocated task in
|
||||
// UVM memory to be copied to the GPU.
|
||||
// Synchronize to guarantee non-concurrent access
|
||||
// between host and device.
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
Impl::cuda_set_apply_single<task_type,void><<<1,1>>>( task );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
return Future< value_type , execution_space >( task );
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
__device__
|
||||
Future< typename FunctorType::value_type , execution_space >
|
||||
task_create( const FunctorType & arg_functor
|
||||
, const unsigned arg_dep_capacity = ~0u ) const
|
||||
{
|
||||
typedef typename FunctorType::value_type value_type ;
|
||||
|
||||
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
|
||||
task_type ;
|
||||
|
||||
task_type * const task =
|
||||
static_cast<task_type*>(
|
||||
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) );
|
||||
|
||||
if ( task ) {
|
||||
// The root part of the class has been constructed.
|
||||
// Must now construct the functor and result specific part.
|
||||
|
||||
task_type::copy_construct( task , arg_functor );
|
||||
|
||||
// Setting the apply pointer on the device requires code
|
||||
// executing on the GPU. If this function is called on the
|
||||
// Host then a kernel must be run.
|
||||
|
||||
task->task_root_type::template set_apply_single< task_type , void >();
|
||||
}
|
||||
|
||||
return Future< value_type , execution_space >( task );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Create thread-team task
|
||||
// Main process and tasks must use different functions
|
||||
// to work around CUDA limitation where __host__ __device__
|
||||
// functions are not allowed to invoke templated __global__ functions.
|
||||
|
||||
template< class FunctorType >
|
||||
Future< typename FunctorType::value_type , execution_space >
|
||||
proc_create_team( const FunctorType & arg_functor
|
||||
, const unsigned arg_dep_capacity = ~0u ) const
|
||||
{
|
||||
typedef typename FunctorType::value_type value_type ;
|
||||
|
||||
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
|
||||
task_type ;
|
||||
|
||||
const unsigned team_shmem_size =
|
||||
Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value
|
||||
( arg_functor , m_policy->m_team_size );
|
||||
|
||||
task_type * const task =
|
||||
static_cast<task_type*>(
|
||||
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) );
|
||||
|
||||
if ( task ) {
|
||||
// The root part of the class has been constructed.
|
||||
// Must now construct the functor and result specific part.
|
||||
|
||||
task_type::copy_construct( task , arg_functor );
|
||||
|
||||
// Setting the apply pointer on the device requires code
|
||||
// executing on the GPU. This function is called on the
|
||||
// host process so a kernel must be run.
|
||||
|
||||
// Launching a kernel will cause the allocated task in
|
||||
// UVM memory to be copied to the GPU.
|
||||
// Synchronize to guarantee non-concurrent access
|
||||
// between host and device.
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
Impl::cuda_set_apply_team<task_type,void><<<1,1>>>( task );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
return Future< value_type , execution_space >( task );
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
__device__
|
||||
Future< typename FunctorType::value_type , execution_space >
|
||||
task_create_team( const FunctorType & arg_functor
|
||||
, const unsigned arg_dep_capacity = ~0u ) const
|
||||
{
|
||||
typedef typename FunctorType::value_type value_type ;
|
||||
|
||||
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
|
||||
task_type ;
|
||||
|
||||
const unsigned team_shmem_size =
|
||||
Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value
|
||||
( arg_functor , m_policy->m_team_size );
|
||||
|
||||
task_type * const task =
|
||||
static_cast<task_type*>(
|
||||
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) );
|
||||
|
||||
if ( task ) {
|
||||
// The root part of the class has been constructed.
|
||||
// Must now construct the functor and result specific part.
|
||||
|
||||
task_type::copy_construct( task , arg_functor );
|
||||
|
||||
// Setting the apply pointer on the device requires code
|
||||
// executing on the GPU. If this function is called on the
|
||||
// Host then a kernel must be run.
|
||||
|
||||
task->task_root_type::template set_apply_team< task_type , void >();
|
||||
}
|
||||
|
||||
return Future< value_type , execution_space >( task );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Create a latch task whose counter starts at N.
// The latch is a bare root task (no functor, no dependence slots, no team
// shared memory) placed in TASK_STATE_WAITING; its m_dep_size member is
// reused as the latch counter.
// Returns a null Future if the task could not be allocated, matching the
// behavior of the other create methods.
Future< Latch , execution_space >
KOKKOS_INLINE_FUNCTION
create_latch( const int N ) const
{
  task_root_type * const task =
    m_policy->allocate_task( sizeof(task_root_type) , 0 , 0 );

  // allocate_task can fail and return null (the other create methods guard
  // against this); do not dereference a null task.
  if ( task ) {
    task->m_dep_size = N ; // Using m_dep_size for latch counter
    task->m_state    = TASK_STATE_WAITING ;
  }

  return Future< Latch , execution_space >( task );
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class A1 , class A2 , class A3 , class A4 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_dependence( const Future<A1,A2> & after
|
||||
, const Future<A3,A4> & before
|
||||
, typename std::enable_if
|
||||
< std::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
|
||||
&&
|
||||
std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
|
||||
>::type * = 0
|
||||
) const
|
||||
{ m_policy->add_dependence( after.m_task , before.m_task ); }
|
||||
|
||||
template< class FunctorType , class A3 , class A4 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_dependence( FunctorType * task_functor
|
||||
, const Future<A3,A4> & before
|
||||
, typename std::enable_if
|
||||
< std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
|
||||
>::type * = 0
|
||||
) const
|
||||
{ m_policy->add_dependence( get_task_root(task_functor) , before.m_task ); }
|
||||
|
||||
|
||||
template< class ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Future< ValueType , execution_space > &
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
if ( f.m_task ) {
|
||||
f.m_task->m_queue =
|
||||
( f.m_task->m_team != 0
|
||||
? & ( m_policy->m_team[ priority ? 0 : 1 ] )
|
||||
: & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
|
||||
m_policy->schedule_task( f.m_task );
|
||||
}
|
||||
return f ;
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void respawn( FunctorType * task_functor
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
task_root_type * const t = get_task_root(task_functor);
|
||||
t->m_queue =
|
||||
( t->m_team != 0 ? & ( m_policy->m_team[ priority ? 0 : 1 ] )
|
||||
: & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
|
||||
m_policy->reschedule_task( t );
|
||||
}
|
||||
|
||||
// When a create method fails by returning a null Future
|
||||
// the task that called the create method may respawn
|
||||
// with a dependence on memory becoming available.
|
||||
// This is a race as more than one task may be respawned
|
||||
// with this need.
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void respawn_needing_memory( FunctorType * task_functor ) const
|
||||
{
|
||||
task_root_type * const t = get_task_root(task_functor);
|
||||
t->m_queue =
|
||||
( t->m_team != 0 ? & ( m_policy->m_team[ 2 ] )
|
||||
: & ( m_policy->m_serial[ 2 ] ) );
|
||||
m_policy->reschedule_task( t );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Functions for an executing task functor to query dependences,
|
||||
// set new dependences, and respawn itself.
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future< void , execution_space >
|
||||
get_dependence( const FunctorType * task_functor , int i ) const
|
||||
{
|
||||
return Future<void,execution_space>(
|
||||
get_task_root(task_functor)->get_dependence(i)
|
||||
);
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int get_dependence( const FunctorType * task_functor ) const
|
||||
{ return get_task_root(task_functor)->get_dependence(); }
|
||||
|
||||
template< class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void clear_dependence( FunctorType * task_functor ) const
|
||||
{ get_task_root(task_functor)->clear_dependence(); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Team-member handle for a serial task: no shared memory, no level-1
// scratch, league rank 0 in a league of size 1.
__device__
static member_type member_single()
{
  return member_type
    ( 0   // shared memory pointer
    , 0   // shared memory begin offset
    , 0   // shared memory end offset
    , 0   // scratch level_1 pointer
    , 0   // scratch level_1 size
    , 0   // league rank
    , 1   // league size
    );
}
|
||||
|
||||
friend void wait( TaskPolicy< Kokkos::Cuda > & );
|
||||
};
|
||||
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */
|
||||
|
||||
|
||||
@ -41,53 +41,266 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_VIEW_HPP
|
||||
#define KOKKOS_CUDA_VIEW_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
#include <impl/Kokkos_Shape.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct AssertShapeBoundsAbort< CudaSpace >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void apply( const size_t /* rank */ ,
|
||||
const size_t /* n0 */ , const size_t /* n1 */ ,
|
||||
const size_t /* n2 */ , const size_t /* n3 */ ,
|
||||
const size_t /* n4 */ , const size_t /* n5 */ ,
|
||||
const size_t /* n6 */ , const size_t /* n7 */ ,
|
||||
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
|
||||
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
|
||||
// Any other scalar type falls back to either normal reads out of global memory,
|
||||
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
|
||||
|
||||
const size_t /* arg_rank */ ,
|
||||
const size_t /* i0 */ , const size_t /* i1 */ ,
|
||||
const size_t /* i2 */ , const size_t /* i3 */ ,
|
||||
const size_t /* i4 */ , const size_t /* i5 */ ,
|
||||
const size_t /* i6 */ , const size_t /* i7 */ )
|
||||
template< typename ValueType , typename AliasType >
|
||||
struct CudaTextureFetch {
|
||||
|
||||
::cudaTextureObject_t m_obj ;
|
||||
const ValueType * m_ptr ;
|
||||
int m_offset ;
|
||||
|
||||
// Dereference operator pulls through texture object and returns by value
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
Kokkos::abort("Kokkos::View array bounds violation");
|
||||
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#else
|
||||
return m_ptr[ i ];
|
||||
#endif
|
||||
}
|
||||
|
||||
// Pointer to referenced memory
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_ptr ; }
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaTextureFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_ptr( rhs.m_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( CudaTextureFetch && rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_ptr( rhs.m_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_ptr = rhs.m_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_ptr = rhs.m_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
// Texture object spans the entire allocation.
|
||||
// This handle may view a subset of the allocation, so an offset is required.
|
||||
template< class CudaMemorySpace >
|
||||
inline explicit
|
||||
CudaTextureFetch( const ValueType * const arg_ptr
|
||||
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
|
||||
)
|
||||
: m_obj( record.template attach_texture_object< AliasType >() )
|
||||
, m_ptr( arg_ptr )
|
||||
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
|
||||
{}
|
||||
|
||||
// Texture object spans the entire allocation.
|
||||
// This handle may view a subset of the allocation, so an offset is required.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs , size_t offset )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_ptr( rhs.m_ptr + offset)
|
||||
, m_offset( offset + rhs.m_offset )
|
||||
{}
|
||||
};
|
||||
|
||||
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
|
||||
|
||||
template< typename ValueType , typename AliasType >
|
||||
struct CudaLDGFetch {
|
||||
|
||||
const ValueType * m_ptr ;
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#else
|
||||
return m_ptr[i];
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_ptr ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch() : m_ptr() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaLDGFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch( const CudaLDGFetch & rhs )
|
||||
: m_ptr( rhs.m_ptr )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch( CudaLDGFetch && rhs )
|
||||
: m_ptr( rhs.m_ptr )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
|
||||
{
|
||||
m_ptr = rhs.m_ptr ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
|
||||
{
|
||||
m_ptr = rhs.m_ptr ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
template< class CudaMemorySpace >
|
||||
inline explicit
|
||||
CudaLDGFetch( const ValueType * const arg_ptr
|
||||
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
|
||||
)
|
||||
: m_ptr( arg_ptr )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaLDGFetch( CudaLDGFetch const rhs ,size_t offset)
|
||||
: m_ptr( rhs.m_ptr + offset )
|
||||
{}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
|
||||
* if 'const' value type, CudaSpace and random access.
|
||||
*/
|
||||
template< class Traits >
|
||||
class ViewDataHandle< Traits ,
|
||||
typename std::enable_if<(
|
||||
// Is Cuda memory space
|
||||
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
|
||||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
|
||||
&&
|
||||
// Is a trivial const value of 4, 8, or 16 bytes
|
||||
std::is_trivial<typename Traits::const_value_type>::value
|
||||
&&
|
||||
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
|
||||
&&
|
||||
( sizeof(typename Traits::const_value_type) == 4 ||
|
||||
sizeof(typename Traits::const_value_type) == 8 ||
|
||||
sizeof(typename Traits::const_value_type) == 16 )
|
||||
&&
|
||||
// Random access trait
|
||||
( Traits::memory_traits::RandomAccess != 0 )
|
||||
)>::type >
|
||||
{
|
||||
public:
|
||||
|
||||
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
|
||||
|
||||
using value_type = typename Traits::const_value_type ;
|
||||
using return_type = typename Traits::const_value_type ; // NOT a reference
|
||||
|
||||
using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int ,
|
||||
typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 ,
|
||||
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
|
||||
>::type
|
||||
>::type
|
||||
>::type ;
|
||||
|
||||
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
|
||||
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
|
||||
#else
|
||||
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
|
||||
{
|
||||
return arg_handle ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type const assign( handle_type const & arg_handle , size_t offset )
|
||||
{
|
||||
return handle_type(arg_handle,offset) ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
// Assignment of texture = non-texture requires creation of a texture object
|
||||
// which can only occur on the host. In addition, 'get_record' is only valid
|
||||
// if called in a host execution space
|
||||
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
|
||||
#else
|
||||
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
|
||||
return handle_type();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif // KOKKOS_HAVE_CUDA
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
|
||||
|
||||
|
||||
@ -47,18 +47,10 @@
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
#include "Kokkos_Macros.hpp"
|
||||
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
|
||||
#error "Cuda version 4.1 or greater required"
|
||||
#endif
|
||||
|
||||
#if ( __CUDA_ARCH__ < 200 )
|
||||
#error "Cuda device capability 2.0 or greater required"
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
/* Cuda runtime function, declared in <crt/device_runtime.h>
|
||||
* Requires capability 2.x or better.
|
||||
@ -90,30 +82,6 @@ void cuda_abort( const char * const message )
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void cuda_abort( const char * const ) {}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
|
||||
namespace Kokkos {
|
||||
__device__ inline
|
||||
void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); }
|
||||
}
|
||||
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined(__CUDACC__) && defined( KOKKOS_HAVE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
|
||||
|
||||
|
||||
@ -75,15 +75,16 @@
|
||||
#if defined(_WIN32)
|
||||
#define KOKKOS_ATOMICS_USE_WINDOWS
|
||||
#else
|
||||
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
// Compiling NVIDIA device code, must use Cuda atomics:
|
||||
|
||||
#define KOKKOS_ATOMICS_USE_CUDA
|
||||
#endif
|
||||
|
||||
#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
|
||||
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
|
||||
! defined( KOKKOS_ATOMICS_USE_OMP31 )
|
||||
#if ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
|
||||
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
|
||||
! defined( KOKKOS_ATOMICS_USE_OMP31 )
|
||||
|
||||
// Compiling for non-Cuda atomic implementation has not been pre-selected.
|
||||
// Choose the best implementation for the detected compiler.
|
||||
@ -91,7 +92,7 @@
|
||||
|
||||
#if defined( KOKKOS_COMPILER_GNU ) || \
|
||||
defined( KOKKOS_COMPILER_CLANG ) || \
|
||||
( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
|
||||
( defined ( KOKKOS_COMPILER_NVCC ) )
|
||||
|
||||
#define KOKKOS_ATOMICS_USE_GCC
|
||||
|
||||
@ -126,6 +127,9 @@ namespace Impl {
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully acquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr);
|
||||
|
||||
@ -135,6 +139,9 @@ bool lock_address_cuda_space(void* ptr);
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully acquiring a lock with
|
||||
/// lock_address.
|
||||
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr);
|
||||
}
|
||||
@ -287,7 +294,7 @@ const char * atomic_query_version()
|
||||
//----------------------------------------------------------------------------
|
||||
// This atomic-style macro should be an inlined function, not a macro
|
||||
|
||||
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
|
||||
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) && !defined(__CUDA_ARCH__)
|
||||
|
||||
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
|
||||
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
|
||||
|
||||
@ -46,7 +46,14 @@
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
// Needed for 'is_space<S>::host_mirror_space'
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
//Schedules for Execution Policies
|
||||
struct Static {};
|
||||
struct Dynamic {};
|
||||
@ -59,7 +66,7 @@ struct Schedule
|
||||
|| std::is_same<T,Dynamic>::value
|
||||
, "Kokkos: Invalid Schedule<> type."
|
||||
);
|
||||
using schedule_type = Schedule<T>;
|
||||
using schedule_type = Schedule ;
|
||||
using type = T;
|
||||
};
|
||||
|
||||
@ -68,11 +75,268 @@ template<typename T>
|
||||
struct IndexType
|
||||
{
|
||||
static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
|
||||
using index_type = IndexType<T>;
|
||||
using index_type = IndexType ;
|
||||
using type = T;
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// Generates a trait `is_<CONCEPT><T>`.
// `value` is true only when T declares a nested type named CONCEPT and that
// nested type is T itself (i.e. T "tags" itself as the concept).
// If the nested type is missing, or names a different type, the partial
// specialization of `have` does not match and `value` is false.
#define KOKKOS_IMPL_IS_CONCEPT( CONCEPT ) \
|
||||
template< typename T > struct is_ ## CONCEPT { \
|
||||
private: \
|
||||
template< typename , typename = std::true_type > struct have : std::false_type {}; \
|
||||
template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
|
||||
public: \
|
||||
enum { value = is_ ## CONCEPT::template have<T>::value }; \
|
||||
};
|
||||
|
||||
// Public concept:
|
||||
|
||||
KOKKOS_IMPL_IS_CONCEPT( memory_space )
|
||||
KOKKOS_IMPL_IS_CONCEPT( memory_traits )
|
||||
KOKKOS_IMPL_IS_CONCEPT( execution_space )
|
||||
KOKKOS_IMPL_IS_CONCEPT( execution_policy )
|
||||
KOKKOS_IMPL_IS_CONCEPT( array_layout )
|
||||
|
||||
// Kokkos::Impl view of the concept traits: re-exports the public traits
// declared in namespace Kokkos and defines three additional traits
// (is_iteration_pattern, is_schedule_type, is_index_type) via the macro.
namespace Impl {
|
||||
|
||||
// For backward compatibility:
|
||||
|
||||
using Kokkos::is_memory_space ;
|
||||
using Kokkos::is_memory_traits ;
|
||||
using Kokkos::is_execution_space ;
|
||||
using Kokkos::is_execution_policy ;
|
||||
using Kokkos::is_array_layout ;
|
||||
|
||||
// Implementation concept:
|
||||
|
||||
KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
|
||||
KOKKOS_IMPL_IS_CONCEPT( schedule_type )
|
||||
KOKKOS_IMPL_IS_CONCEPT( index_type )
|
||||
|
||||
}
|
||||
|
||||
#undef KOKKOS_IMPL_IS_CONCEPT
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// Pairs an execution space with a memory space.
// Both template arguments are validated at compile time; the struct then
// exposes them as execution_space / memory_space and tags itself through
// device_type (so that is_space's device_type check matches it).
template< class ExecutionSpace , class MemorySpace >
|
||||
struct Device {
|
||||
static_assert( Kokkos::is_execution_space<ExecutionSpace>::value
|
||||
, "Execution space is not valid" );
|
||||
static_assert( Kokkos::is_memory_space<MemorySpace>::value
|
||||
, "Memory space is not valid" );
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef MemorySpace memory_space;
|
||||
typedef Device<execution_space,memory_space> device_type;
|
||||
};
|
||||
|
||||
|
||||
// Trait answering "is T a Kokkos space?".
// `value` is true when T names itself through a nested execution_space,
// memory_space, or device_type typedef. The trait also exposes the
// execution_space / memory_space that T reports (void when T has no such
// nested type) and the host_* types derived from them below.
template< typename T >
|
||||
struct is_space {
|
||||
private:
|
||||
|
||||
// Primary detectors: used when T lacks the corresponding nested type.
// They evaluate to false and report `void` as the space.
template< typename , typename = void >
|
||||
struct exe : std::false_type { typedef void space ; };
|
||||
|
||||
template< typename , typename = void >
|
||||
struct mem : std::false_type { typedef void space ; };
|
||||
|
||||
template< typename , typename = void >
|
||||
struct dev : std::false_type { typedef void space ; };
|
||||
|
||||
// Specializations selected when the nested type exists (the conditional
// always yields void, so it only serves to make the nested name a
// substitution requirement). Each is true only if the nested type is U itself.
template< typename U >
|
||||
struct exe<U,typename std::conditional<true,void,typename U::execution_space>::type>
|
||||
: std::is_same<U,typename U::execution_space>::type
|
||||
{ typedef typename U::execution_space space ; };
|
||||
|
||||
template< typename U >
|
||||
struct mem<U,typename std::conditional<true,void,typename U::memory_space>::type>
|
||||
: std::is_same<U,typename U::memory_space>::type
|
||||
{ typedef typename U::memory_space space ; };
|
||||
|
||||
template< typename U >
|
||||
struct dev<U,typename std::conditional<true,void,typename U::device_type>::type>
|
||||
: std::is_same<U,typename U::device_type>::type
|
||||
{ typedef typename U::device_type space ; };
|
||||
|
||||
typedef typename is_space::template exe<T> is_exe ;
|
||||
typedef typename is_space::template mem<T> is_mem ;
|
||||
typedef typename is_space::template dev<T> is_dev ;
|
||||
|
||||
public:
|
||||
|
||||
// True if T is an execution space, a memory space, or a device type.
enum { value = is_exe::value || is_mem::value || is_dev::value };
|
||||
|
||||
typedef typename is_exe::space execution_space ;
|
||||
typedef typename is_mem::space memory_space ;
|
||||
|
||||
// For backward compatibility, deprecated in favor of
|
||||
// Kokkos::Impl::HostMirror<S>::host_mirror_space
|
||||
|
||||
// host_memory_space: memory_space itself when it is HostSpace (or, with
// KOKKOS_HAVE_CUDA, CudaUVMSpace or CudaHostPinnedSpace); otherwise HostSpace.
typedef typename std::conditional
|
||||
< std::is_same< memory_space , Kokkos::HostSpace >::value
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|| std::is_same< memory_space , Kokkos::CudaUVMSpace >::value
|
||||
|| std::is_same< memory_space , Kokkos::CudaHostPinnedSpace >::value
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
|
||||
, memory_space
|
||||
, Kokkos::HostSpace
|
||||
>::type host_memory_space ;
|
||||
|
||||
// host_execution_space: with KOKKOS_HAVE_CUDA, Kokkos::Cuda is replaced by
// DefaultHostExecutionSpace; any other execution_space is used as-is.
#if defined( KOKKOS_HAVE_CUDA )
|
||||
typedef typename std::conditional
|
||||
< std::is_same< execution_space , Kokkos::Cuda >::value
|
||||
, Kokkos::DefaultHostExecutionSpace , execution_space
|
||||
>::type host_execution_space ;
|
||||
#else
|
||||
typedef execution_space host_execution_space ;
|
||||
#endif
|
||||
|
||||
// host_mirror_space: T itself when both of its spaces already equal their
// host counterparts; otherwise a Device built from the two host spaces.
typedef typename std::conditional
|
||||
< std::is_same< execution_space , host_execution_space >::value &&
|
||||
std::is_same< memory_space , host_memory_space >::value
|
||||
, T , Kokkos::Device< host_execution_space , host_memory_space >
|
||||
>::type host_mirror_space ;
|
||||
};
|
||||
|
||||
// For backward compatibility
|
||||
|
||||
namespace Impl {
|
||||
|
||||
using Kokkos::is_space ;
|
||||
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief  Access relationship between DstMemorySpace and SrcMemorySpace
|
||||
*
|
||||
* The default case can assume accessibility for the same space.
|
||||
* Specializations must be defined for different memory spaces.
|
||||
*
* In this primary template all three relationships (assignable,
* accessible, deepcopy) are true only when DstMemorySpace and
* SrcMemorySpace are the same type, and false otherwise.
*/
|
||||
template< typename DstMemorySpace , typename SrcMemorySpace >
|
||||
struct MemorySpaceAccess {
|
||||
|
||||
// Both template arguments must satisfy the memory_space concept.
static_assert( Kokkos::is_memory_space< DstMemorySpace >::value &&
|
||||
Kokkos::is_memory_space< SrcMemorySpace >::value
|
||||
, "template arguments must be memory spaces" );
|
||||
|
||||
/**\brief  Can a View (or pointer) to memory in SrcMemorySpace
|
||||
*         be assigned to a View (or pointer) to memory marked DstMemorySpace.
|
||||
*
|
||||
*  1. DstMemorySpace::execution_space == SrcMemorySpace::execution_space
|
||||
*  2. All execution spaces that can access DstMemorySpace can also access
|
||||
*     SrcMemorySpace.
|
||||
*/
|
||||
enum { assignable = std::is_same<DstMemorySpace,SrcMemorySpace>::value };
|
||||
|
||||
/**\brief  For all DstExecSpace::memory_space == DstMemorySpace
|
||||
*         DstExecSpace can access SrcMemorySpace.
|
||||
*/
|
||||
enum { accessible = assignable };
|
||||
|
||||
/**\brief  Does a DeepCopy capability exist
|
||||
*         to DstMemorySpace from SrcMemorySpace
|
||||
*/
|
||||
enum { deepcopy = assignable };
|
||||
};
|
||||
|
||||
|
||||
/**\brief  Can AccessSpace access MemorySpace ?
|
||||
*
|
||||
*   Requires:
|
||||
*     Kokkos::is_space< AccessSpace >::value
|
||||
*     Kokkos::is_memory_space< MemorySpace >::value
|
||||
*
|
||||
*   Can AccessSpace::execution_space access MemorySpace ?
|
||||
*     enum : bool { accessible };
|
||||
*
|
||||
*   Is View<AccessSpace::memory_space> assignable from View<MemorySpace> ?
|
||||
*     enum : bool { assignable };
|
||||
*
|
||||
*   If ! accessible then through which intercessory memory space
|
||||
*   should be used to deep copy memory for
|
||||
*     AccessSpace::execution_space
|
||||
*   to get access.
|
||||
*   When AccessSpace::memory_space == Kokkos::HostSpace
|
||||
*   then space is the View host mirror space.
|
||||
*/
|
||||
template< typename AccessSpace , typename MemorySpace >
|
||||
struct SpaceAccessibility {
|
||||
private:
|
||||
|
||||
static_assert( Kokkos::is_space< AccessSpace >::value
|
||||
, "template argument #1 must be a Kokkos space" );
|
||||
|
||||
static_assert( Kokkos::is_memory_space< MemorySpace >::value
|
||||
, "template argument #2 must be a Kokkos memory space" );
|
||||
|
||||
// The input AccessSpace may be a Device<ExecSpace,MemSpace>
|
||||
// verify that it is a valid combination of spaces.
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess
|
||||
< typename AccessSpace::execution_space::memory_space
|
||||
, typename AccessSpace::memory_space
|
||||
>::accessible
|
||||
, "template argument #1 is an invalid space" );
|
||||
|
||||
// Relationship between the memory space of AccessSpace's execution space
// and MemorySpace.
typedef Kokkos::Impl::MemorySpaceAccess
|
||||
< typename AccessSpace::execution_space::memory_space , MemorySpace >
|
||||
exe_access ;
|
||||
|
||||
// Relationship between AccessSpace's own memory space and MemorySpace.
typedef Kokkos::Impl::MemorySpaceAccess
|
||||
< typename AccessSpace::memory_space , MemorySpace >
|
||||
mem_access ;
|
||||
|
||||
public:
|
||||
|
||||
/**\brief  Can AccessSpace::execution_space access MemorySpace ?
|
||||
*
|
||||
*   Default based upon memory space accessibility.
|
||||
*   Specialization required for other relationships.
|
||||
*/
|
||||
enum { accessible = exe_access::accessible };
|
||||
|
||||
/**\brief  Can assign to AccessSpace from MemorySpace ?
|
||||
*
|
||||
*   Default based upon memory space accessibility.
|
||||
*   Specialization required for other relationships.
|
||||
*/
|
||||
enum { assignable =
|
||||
is_memory_space< AccessSpace >::value && mem_access::assignable };
|
||||
|
||||
/**\brief  Can deep copy to AccessSpace::memory_space from MemorySpace ?  */
|
||||
enum { deepcopy = mem_access::deepcopy };
|
||||
|
||||
// What intercessory space for AccessSpace::execution_space
|
||||
// to be able to access MemorySpace?
|
||||
// If same memory space or not accessible use the AccessSpace
|
||||
// else construct a device with execution space and memory space.
|
||||
typedef typename std::conditional
|
||||
< std::is_same<typename AccessSpace::memory_space,MemorySpace>::value ||
|
||||
! exe_access::accessible
|
||||
, AccessSpace
|
||||
, Kokkos::Device< typename AccessSpace::execution_space , MemorySpace >
|
||||
>::type space ;
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif // KOKKOS_CORE_CONCEPTS_HPP
|
||||
|
||||
|
||||
@ -72,6 +72,7 @@
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_hwloc.hpp>
|
||||
#include <Kokkos_Timer.hpp>
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
#include <Kokkos_Complex.hpp>
|
||||
@ -112,7 +113,6 @@ void fence();
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/* Allocate memory from a memory space.
|
||||
* The allocation is tracked in Kokkos memory tracking system, so
|
||||
@ -155,18 +155,8 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
reallocate_tracked( arg_alloc , arg_alloc_size );
|
||||
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
using Kokkos::Experimental::kokkos_malloc ;
|
||||
using Kokkos::Experimental::kokkos_realloc ;
|
||||
using Kokkos::Experimental::kokkos_free ;
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -49,6 +49,7 @@
|
||||
// and compiler environment then sets a collection of #define macros.
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_Utilities.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Have assumed a 64bit build (8byte pointers) throughout the code base.
|
||||
@ -207,7 +208,7 @@ namespace Impl {
|
||||
|
||||
template< class Functor
|
||||
, class Policy
|
||||
, class EnableFunctor = void
|
||||
, class EnableFunctor = void
|
||||
, class EnablePolicy = void
|
||||
>
|
||||
struct FunctorPolicyExecutionSpace;
|
||||
@ -220,7 +221,7 @@ struct FunctorPolicyExecutionSpace;
|
||||
/// This is an implementation detail of parallel_for. Users should
|
||||
/// skip this and go directly to the nonmember function parallel_for.
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelFor ;
|
||||
|
||||
/// \class ParallelReduce
|
||||
@ -229,7 +230,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
|
||||
/// This is an implementation detail of parallel_reduce. Users should
|
||||
/// skip this and go directly to the nonmember function parallel_reduce.
|
||||
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelReduce ;
|
||||
|
||||
/// \class ParallelScan
|
||||
@ -238,8 +239,8 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
|
||||
/// This is an implementation detail of parallel_scan. Users should
|
||||
/// skip this and go directly to the documentation of the nonmember
|
||||
/// template function Kokkos::parallel_scan.
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelScan ;
|
||||
|
||||
}}
|
||||
|
||||
@ -56,7 +56,7 @@
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_TaskScheduler.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
@ -229,6 +229,39 @@ private:
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// Access traits for DstMemorySpace = CudaSpace and
// SrcMemorySpace = Cuda::scratch_memory_space:
// accessible, but neither assignable nor deep-copyable.
template<>
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::CudaSpace
|
||||
, Kokkos::Cuda::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { assignable = false };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = false };
|
||||
};
|
||||
|
||||
#if defined( KOKKOS_USE_CUDA_UVM )
|
||||
|
||||
// If forcing use of UVM everywhere
|
||||
// then must assume that CudaUVMSpace
|
||||
// can be a stand-in for CudaSpace.
|
||||
// This will fail if a host-side execution space
|
||||
// defines CudaUVMSpace as its preferred memory space.
|
||||
|
||||
// Access traits for DstMemorySpace = CudaUVMSpace and
// SrcMemorySpace = Cuda::scratch_memory_space:
// accessible, but neither assignable nor deep-copyable.
template<>
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::CudaUVMSpace
|
||||
, Kokkos::Cuda::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { assignable = false };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = false };
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::CudaSpace
|
||||
@ -259,9 +292,6 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_View.hpp>
|
||||
|
||||
#include <Cuda/KokkosExp_Cuda_View.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
|
||||
|
||||
@ -88,6 +88,9 @@ public:
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
/*--------------------------------*/
|
||||
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
|
||||
static void access_error();
|
||||
@ -97,7 +100,8 @@ private:
|
||||
|
||||
int m_device ; ///< Which Cuda device
|
||||
|
||||
// friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
|
||||
static constexpr const char* m_name = "Cuda";
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
|
||||
};
|
||||
|
||||
namespace Impl {
|
||||
@ -156,6 +160,14 @@ public:
|
||||
/** \brief If UVM capability is available */
|
||||
static bool available();
|
||||
|
||||
|
||||
/*--------------------------------*/
|
||||
/** \brief CudaUVMSpace specific routine */
|
||||
static int number_of_allocations();
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaUVMSpace();
|
||||
@ -172,11 +184,16 @@ public:
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
private:
|
||||
|
||||
int m_device ; ///< Which Cuda device
|
||||
|
||||
static constexpr const char* m_name = "CudaUVM";
|
||||
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -215,6 +232,13 @@ public:
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
private:
|
||||
|
||||
static constexpr const char* m_name = "CudaHostPinned";
|
||||
|
||||
/*--------------------------------*/
|
||||
};
|
||||
|
||||
@ -226,6 +250,126 @@ public:
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" );
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" );
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Dst = HostSpace, Src = CudaSpace:
// not assignable and not accessible; a deep copy is available.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace > {
|
||||
enum { assignable = false };
|
||||
enum { accessible = false };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = HostSpace, Src = CudaUVMSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace > {
|
||||
// HostSpace::execution_space != CudaUVMSpace::execution_space
|
||||
enum { assignable = false };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = HostSpace, Src = CudaHostPinnedSpace:
// assignable, accessible and deep-copyable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace > {
|
||||
// HostSpace::execution_space == CudaHostPinnedSpace::execution_space
|
||||
enum { assignable = true };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Dst = CudaSpace, Src = HostSpace:
// not assignable and not accessible; a deep copy is available.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace > {
|
||||
enum { assignable = false };
|
||||
enum { accessible = false };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaSpace, Src = CudaUVMSpace:
// assignable, accessible and deep-copyable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace > {
|
||||
// CudaSpace::execution_space == CudaUVMSpace::execution_space
|
||||
enum { assignable = true };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaSpace, Src = CudaHostPinnedSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace > {
|
||||
// CudaSpace::execution_space != CudaHostPinnedSpace::execution_space
|
||||
enum { assignable = false };
|
||||
enum { accessible = true }; // CudaSpace::execution_space
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
// CudaUVMSpace::execution_space == Cuda
|
||||
// CudaUVMSpace accessible to both Cuda and Host
|
||||
|
||||
// Dst = CudaUVMSpace, Src = HostSpace:
// not assignable and not accessible; a deep copy is available.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace > {
|
||||
enum { assignable = false };
|
||||
enum { accessible = false }; // Cuda cannot access HostSpace
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaUVMSpace, Src = CudaSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace > {
|
||||
// CudaUVMSpace::execution_space == CudaSpace::execution_space
|
||||
// Can access CudaUVMSpace from Host but cannot access CudaSpace from Host
|
||||
enum { assignable = false };
|
||||
|
||||
// CudaUVMSpace::execution_space can access CudaSpace
|
||||
enum { accessible = true };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaUVMSpace, Src = CudaHostPinnedSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace > {
|
||||
// CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space
|
||||
enum { assignable = false };
|
||||
enum { accessible = true }; // CudaUVMSpace::execution_space
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
|
||||
//----------------------------------------
|
||||
// CudaHostPinnedSpace::execution_space == HostSpace::execution_space
|
||||
// CudaHostPinnedSpace accessible to both Cuda and Host
|
||||
|
||||
// Dst = CudaHostPinnedSpace, Src = HostSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace > {
|
||||
enum { assignable = false }; // Cannot access from Cuda
|
||||
enum { accessible = true }; // CudaHostPinnedSpace::execution_space
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaHostPinnedSpace, Src = CudaSpace:
// not assignable and not accessible; a deep copy is available.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace > {
|
||||
enum { assignable = false }; // Cannot access from Host
|
||||
enum { accessible = false };
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
// Dst = CudaHostPinnedSpace, Src = CudaUVMSpace:
// accessible and deep-copyable, but not assignable.
template<>
|
||||
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace > {
|
||||
enum { assignable = false }; // different execution_space
|
||||
enum { accessible = true }; // same accessibility
|
||||
enum { deepcopy   = true };
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void DeepCopyAsyncCuda( void * dst , const void * src , size_t n);
|
||||
|
||||
template<> struct DeepCopy< CudaSpace , CudaSpace , Cuda>
|
||||
@ -553,7 +697,6 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHost
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
@ -791,7 +934,6 @@ public:
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -52,6 +52,7 @@
|
||||
#include <impl/Kokkos_AnalyzePolicy.hpp>
|
||||
#include <Kokkos_Concepts.hpp>
|
||||
#include <iostream>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
@ -82,7 +83,6 @@ class RangePolicy
|
||||
: public Impl::PolicyTraits<Properties ... >
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Impl::PolicyTraits<Properties ... > traits;
|
||||
|
||||
typename traits::execution_space m_space ;
|
||||
@ -90,8 +90,8 @@ private:
|
||||
typename traits::index_type m_end ;
|
||||
typename traits::index_type m_granularity ;
|
||||
typename traits::index_type m_granularity_mask ;
|
||||
public:
|
||||
|
||||
public:
|
||||
//! Tag this class as an execution policy
|
||||
typedef RangePolicy execution_policy;
|
||||
typedef typename traits::index_type member_type ;
|
||||
@ -100,7 +100,6 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
|
||||
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
|
||||
|
||||
|
||||
//TODO: find a better workaround for Clang's weird instantiation order
|
||||
// This thing is here because of an instantiation error, where the RangePolicy is inserted into FunctorValue Traits, which
|
||||
// tries decltype on the operator. It tries to do this even though the first argument of parallel for clearly doesn't match.
|
||||
@ -135,47 +134,45 @@ public:
|
||||
, work_begin , work_end )
|
||||
{}
|
||||
|
||||
public:
|
||||
public:
|
||||
/** \brief return chunk_size */
|
||||
inline member_type chunk_size() const {
|
||||
return m_granularity;
|
||||
}
|
||||
|
||||
/** \brief return chunk_size */
|
||||
inline member_type chunk_size() const {
|
||||
return m_granularity;
|
||||
}
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline RangePolicy set_chunk_size(int chunk_size_) const {
|
||||
RangePolicy p = *this;
|
||||
p.m_granularity = chunk_size_;
|
||||
p.m_granularity_mask = p.m_granularity - 1;
|
||||
return p;
|
||||
}
|
||||
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline RangePolicy set_chunk_size(int chunk_size_) const {
|
||||
RangePolicy p = *this;
|
||||
p.m_granularity = chunk_size_;
|
||||
p.m_granularity_mask = p.m_granularity - 1;
|
||||
return p;
|
||||
}
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
typename traits::index_type concurrency = traits::execution_space::concurrency();
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
|
||||
typename traits::index_type concurrency = traits::execution_space::concurrency();
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
if(m_granularity > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_granularity ))
|
||||
Kokkos::abort("RangePolicy blocking granularity must be power of two" );
|
||||
}
|
||||
|
||||
if(m_granularity > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_granularity ))
|
||||
Kokkos::abort("RangePolicy blocking granularity must be power of two" );
|
||||
}
|
||||
member_type new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_end-m_begin)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_granularity = new_chunk_size;
|
||||
m_granularity_mask = m_granularity - 1;
|
||||
}
|
||||
|
||||
|
||||
member_type new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_end-m_begin)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_granularity = new_chunk_size;
|
||||
m_granularity_mask = m_granularity - 1;
|
||||
}
|
||||
|
||||
public:
|
||||
public:
|
||||
/** \brief Subrange for a partition's rank and size.
|
||||
*
|
||||
* Typically used to partition a range over a group of threads.
|
||||
@ -212,16 +209,15 @@ public:
|
||||
if ( range.end() < m_end ) m_end = range.end() ;
|
||||
}
|
||||
}
|
||||
private:
|
||||
member_type m_begin ;
|
||||
member_type m_end ;
|
||||
WorkRange();
|
||||
WorkRange & operator = ( const WorkRange & );
|
||||
|
||||
private:
|
||||
member_type m_begin ;
|
||||
member_type m_end ;
|
||||
WorkRange();
|
||||
WorkRange & operator = ( const WorkRange & );
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -231,7 +227,6 @@ namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
|
||||
template< class ExecSpace, class ... Properties>
|
||||
class TeamPolicyInternal: public Impl::PolicyTraits<Properties ... > {
|
||||
private:
|
||||
@ -245,6 +240,10 @@ public:
|
||||
* This size takes into account execution space concurrency limitations and
|
||||
* scratch memory space limitations for reductions, team reduce/scan, and
|
||||
* team shared memory.
|
||||
*
|
||||
* This function only works for single-operator functors.
|
||||
* With multi-operator functors it cannot be determined
|
||||
* which operator will be called.
|
||||
*/
|
||||
template< class FunctorType >
|
||||
static int team_size_max( const FunctorType & );
|
||||
@ -254,6 +253,10 @@ public:
|
||||
* This size takes into account execution space concurrency limitations and
|
||||
* scratch memory space limitations for reductions, team reduce/scan, and
|
||||
* team shared memory.
|
||||
*
|
||||
* This function only works for single-operator functors.
|
||||
* With multi-operator functors it cannot be determined
|
||||
* which operator will be called.
|
||||
*/
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & );
|
||||
@ -344,9 +347,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
namespace Impl {
|
||||
struct PerTeamValue {
|
||||
int value;
|
||||
PerTeamValue(int arg);
|
||||
@ -356,12 +357,12 @@ namespace Impl {
|
||||
int value;
|
||||
PerThreadValue(int arg);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
Impl::PerTeamValue PerTeam(const int& arg);
|
||||
Impl::PerThreadValue PerThread(const int& arg);
|
||||
|
||||
|
||||
/** \brief Execution policy for parallel work over a league of teams of threads.
|
||||
*
|
||||
* The work functor is called for each thread of each team such that
|
||||
@ -443,10 +444,6 @@ public:
|
||||
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType, class TeamMemberType>
|
||||
@ -484,8 +481,8 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
|
||||
, const iType& arg_end
|
||||
)
|
||||
, const iType& arg_end
|
||||
)
|
||||
: start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
|
||||
, end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
|
||||
, thread( arg_thread )
|
||||
@ -502,32 +499,33 @@ public:
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType, class TeamMemberType>
|
||||
struct ThreadVectorRangeBoundariesStruct {
|
||||
typedef iType index_type;
|
||||
enum {start = 0};
|
||||
const iType end;
|
||||
enum {increment = 1};
|
||||
template<typename iType, class TeamMemberType>
|
||||
struct ThreadVectorRangeBoundariesStruct {
|
||||
typedef iType index_type;
|
||||
enum {start = 0};
|
||||
const iType end;
|
||||
enum {increment = 1};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count):
|
||||
end( count )
|
||||
{}
|
||||
};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct ( const TeamMemberType, const iType& count ) : end( count ) {}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct ( const iType& count ) : end( count ) {}
|
||||
};
|
||||
|
||||
template<class TeamMemberType>
|
||||
struct ThreadSingleStruct {
|
||||
const TeamMemberType& team_member;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
|
||||
};
|
||||
template<class TeamMemberType>
|
||||
struct ThreadSingleStruct {
|
||||
const TeamMemberType& team_member;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
|
||||
};
|
||||
|
||||
template<class TeamMemberType>
|
||||
struct VectorSingleStruct {
|
||||
const TeamMemberType& team_member;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
VectorSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
|
||||
};
|
||||
|
||||
template<class TeamMemberType>
|
||||
struct VectorSingleStruct {
|
||||
const TeamMemberType& team_member;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
/** \brief  Execution policy for parallel work over the threads within a team.
|
||||
@ -538,7 +536,8 @@ public:
|
||||
*/
|
||||
template<typename iType, class TeamMemberType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType>
|
||||
TeamThreadRange( const TeamMemberType&, const iType& count );
|
||||
|
||||
/** \brief  Execution policy for parallel work over the threads within a team.
|
||||
*
|
||||
@ -546,9 +545,10 @@ Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(cons
|
||||
* This policy is used together with a parallel pattern as a nested layer within a kernel launched
|
||||
* with the TeamPolicy. This variant expects a begin and end. So the range is [begin,end).
|
||||
*/
|
||||
template<typename iType, class TeamMemberType>
|
||||
template<typename iType1, typename iType2, class TeamMemberType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
|
||||
Impl::TeamThreadRangeBoundariesStruct<typename std::common_type<iType1, iType2>::type, TeamMemberType>
|
||||
TeamThreadRange( const TeamMemberType&, const iType1& begin, const iType2& end );
|
||||
|
||||
/** \brief Execution policy for a vector parallel loop.
|
||||
*
|
||||
@ -558,13 +558,12 @@ Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(cons
|
||||
*/
|
||||
template<typename iType, class TeamMemberType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
|
||||
ThreadVectorRange( const TeamMemberType&, const iType& count );
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
#endif /* #define KOKKOS_EXECPOLICY_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -46,7 +46,6 @@
|
||||
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <impl/Kokkos_HBWAllocators.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
@ -148,11 +147,14 @@ public:
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
private:
|
||||
|
||||
AllocationMechanism m_alloc_mech ;
|
||||
|
||||
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
|
||||
static constexpr const char* m_name = "HBW";
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
|
||||
};
|
||||
|
||||
} // namespace Experimental
|
||||
@ -162,7 +164,6 @@ private:
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
@ -239,9 +240,33 @@ public:
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
|
||||
enum { assignable = true };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy = true };
|
||||
};
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
|
||||
enum { assignable = false };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy = true };
|
||||
};
|
||||
|
||||
}}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user