Compare commits

...

63 Commits

Author SHA1 Message Date
d2fc88a626 patch 17Jan17 2017-01-17 10:14:53 -07:00
c52a26382f Merge pull request #339 from akohlmey/fixes-for-srp-example
Fixes for srp example
2017-01-17 09:36:28 -07:00
ad4d299975 Merge pull request #335 from stanmoore1/neighbor_fixes
Neighbor fixes
2017-01-17 09:33:25 -07:00
83408b195f Merge pull request #342 from epfl-cosmo/ipi-multiinit-bug
Bugfix in the fix_ipi initialization - prevents multiple open_socket calls
2017-01-17 09:14:03 -07:00
cd7bdf9251 Merge pull request #341 from stanmoore1/qeq_kk_neighlist
Make fix_qeq_reax_kokkos request its own neighbor list
2017-01-17 09:13:47 -07:00
8c5b108900 Merge pull request #340 from stanmoore1/fix_rx_neighborlist
Make fix_rx request its own neighbor list
2017-01-17 09:13:27 -07:00
c19d2011bb Merge pull request #334 from sstrong99/flow_gauss_changeRef
Updated the reference for the flow/gauss method
2017-01-17 09:12:22 -07:00
973bef4d45 Merge pull request #332 from akohlmey/coord-atom-orientorder-atom-enhancements
Coord atom orientorder atom enhancements
2017-01-17 09:11:45 -07:00
1b9e50c8cb Merge pull request #331 from timattox/USER-DPD_fix_example_typo
USER-DPD: fix a typo in the DPD-H example input; update reference output.
2017-01-17 09:08:14 -07:00
252e07e083 Merge pull request #330 from akohlmey/collected-small-bugfixes
Collected small bugfixes
2017-01-17 09:08:00 -07:00
74a661ae26 Merge pull request #328 from akohlmey/print-last-command-on-error
print the last input line, when error->all() is called
2017-01-17 09:05:19 -07:00
d8bc590aaf Merge pull request #327 from stanmoore1/kokkos_lib_update
Updating Kokkos lib
2017-01-17 09:04:12 -07:00
c9bea60710 Merge pull request #326 from Pakketeretet2/github-tutorial-update
Updated images of successful merge.
2017-01-17 09:03:46 -07:00
5cd856c97f fix spring doc page update 2017-01-17 09:02:56 -07:00
2f13365cf5 avoid spurious error message, when no storage fix is active/used 2017-01-16 17:08:00 -05:00
0a2b78acb8 rather than adjusting the communication cutoff, we just print out the minimum value needed and error out
i suspect, this communication cutoff adjustment was included into the code before it was possible to separately set it via comm_modify. stopping with an error message printing the needed/current value is cleaner, in keeping with other modules in LAMMPS and much less problematic.
2017-01-16 15:47:02 -05:00
3f46b6d782 fix bugs from incorrect code synchronization 2017-01-16 11:15:54 -05:00
5abd6e5122 reordering operations in Pair::init_style() to avoid segfaults w/o a kspace style 2017-01-16 11:08:48 -05:00
f3a82f454e Included a flag to prevent multiple open_socket calls if run is included multiple times in the LAMMPS input 2017-01-16 08:42:23 +01:00
473a3ebeef fix for bug with compute rdf with pair reax/c. we must not copy a neighbor list, if newton settings are not compatible
an alternate route to address this issue would be to allow an "ANY" setting for neighbor list requests and then query the neighbor list for newton setting instead of the force class.
2017-01-15 12:05:19 -05:00
b220850377 Removing neighbor list hack in fix_qeq_reax_kokkos 2017-01-14 16:16:02 -07:00
fa00e0593f Make fix_rx request its own neighbor list 2017-01-14 15:39:37 -07:00
4a09399dc6 during setup, checking timestep doesn't seem to be sufficient. comparing bins and stencil point, too.
in addition, relevant pointers were not properly initialized to NULL
2017-01-14 17:13:22 -05:00
5821fe8dd5 correct out-of-bounds accesses 2017-01-14 17:06:23 -05:00
98ceb6feb1 add missing html files to lammps.book 2017-01-13 18:11:23 -05:00
61cff85435 avoid not only division by zero, but also computing variance for short runs with insufficient resolution 2017-01-13 14:35:35 -05:00
aa0b327f7e Merge branch 'bugfix_dividebyzero' of https://github.com/timattox/lammps_USER-DPD into collected-small-bugfixes 2017-01-13 14:26:10 -05:00
04fe071968 Merge pull request #6 from ibaned/cuda-lj-ctor-warning
fix a CUDA constructor warning
2017-01-13 12:13:43 -07:00
78498715b4 Protect from divide by zero in mpi_timings() when printing results.
e.g. If neighbor list(s) are never rebuilt, the Neigh time will be zero.
2017-01-13 13:32:15 -05:00
b2f67fea30 Merge branch 'collected-small-bugfixes' of github.com:akohlmey/lammps into collected-small-bugfixes 2017-01-13 08:12:10 -05:00
c59bcf31d1 change $MKLROOT to $(MKLROOT) as reported by @WeiLiPenguin
This closes #336
2017-01-13 08:10:51 -05:00
2540fc281c Merge branch 'flow_gauss_changeRef' of github.com:sstrong99/lammps into pull-334 2017-01-12 23:54:52 -05:00
e8e03dd440 Updated the reference for the flow/gauss method, the new reference is much more comprehensive 2017-01-12 23:44:33 -05:00
daf766d4f8 Fixing Kokkos neighbor bug 2017-01-12 16:22:38 -07:00
630783c8e8 Fixing neighbor bug 2017-01-12 16:22:24 -07:00
c94030d966 put pair_lj_coul in kokkos_type.h
also rename pair_lj_coul_gromacs
so it doesn't conflict with the
one now in kokkos_type.h
2017-01-12 13:37:53 -07:00
1229f6f60b Updated the reference for the flow/gauss method, the new reference is much more comprehensive 2017-01-12 10:15:18 -07:00
0b081b0086 whitespace cleanup 2017-01-11 21:05:32 -05:00
8e1cf6643c apply bugfix to fix wall/gran by eric_lyster@agilent.com on lammps-users 2017-01-11 20:59:40 -05:00
6950a99162 Revert "remove obsolete warning about fix rigid image flag restrictions"
This reverts commit 51e52b477a.
2017-01-11 19:49:58 -05:00
9f4e5e0661 fix a CUDA constructor warning
The class params_lj_coul was copy-pasted
into many different pair styles, and only
one of them had the proper KOKKOS_INLINE_FUNCTION
annotations for CUDA.
created a header file for this class that
most of the pair styles now include.
One pair style did add extra members,
so it keeps a local copy of the class.
2017-01-11 09:11:35 -07:00
34cb4027df make formatting comment consistent 2017-01-11 07:46:07 -05:00
1d0e600ab7 formatting improvements and small corrections for timer settings and output discussions 2017-01-10 23:47:14 -05:00
7162cafdf5 Squelching output from Makefile 2017-01-10 14:46:30 -07:00
ee9e7cfbd5 Fixing Kokkos CUDA Makefile issue 2017-01-10 13:22:36 -07:00
7839c335da Fixing compile error with Kokkos CUDA Makefiles 2017-01-10 13:05:00 -07:00
622d926849 adapt example inputs for TAD and PRD to the change in compute coord/atom 2017-01-10 13:41:35 -05:00
92d15d4a89 replace string compare with enums, fix memory leak, formatting cleanup 2017-01-10 12:52:37 -05:00
95706ac846 import contributed code for computes coord/atom and orientorder/atom 2017-01-10 12:29:22 -05:00
d06688bb91 USER-DPD: fix a typo in the DPD-H example input; update reference output. 2017-01-10 12:11:20 -05:00
d014e00e53 ignore some newly added styles from packages. 2017-01-09 17:51:38 -05:00
0db2a07993 another workaround for duplicate labels (which sphinx does not like) 2017-01-09 17:51:19 -05:00
33412c76ed correct some formatting issues with USER-NC-DUMP 2017-01-09 17:50:49 -05:00
e5ac49d1de Merge branch 'master' into collected-small-bugfixes 2017-01-09 17:13:46 -05:00
1a81da0f73 print the last input line, when error->all() is called
this should help tracking down input file errors for many
common cases without having to repeat the run with -echo screen
and avoid having to explain how to use that feature all the time
2017-01-09 17:03:06 -05:00
ebd25cc078 Updating docs for Kokkos package 2017-01-09 12:40:33 -07:00
9250a55923 Adding enable_lambda to KOKKOS_CUDA_OPTIONS 2017-01-09 12:24:30 -07:00
a9f0b7d523 Updating Kokkos lib 2017-01-09 10:39:46 -07:00
20f8a8c219 Merge branch 'master' into github-tutorial-update 2017-01-09 14:38:09 +01:00
09af780aa8 remove misleading comments 2017-01-06 21:31:39 -05:00
51e52b477a remove obsolete warning about fix rigid image flag restrictions 2017-01-06 21:30:33 -05:00
20a4e365b7 reduce warning when processing manual with sphinx 2017-01-06 21:30:13 -05:00
ccd09e3967 Updated images of successful merge. 2017-01-06 19:04:26 +01:00
441 changed files with 21060 additions and 28170 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 57 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

View File

@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="9 Jan 2017 version">
<META NAME="docnumber" CONTENT="17 Jan 2017 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
<H1></H1>
LAMMPS Documentation :c,h3
9 Jan 2017 version :c,h4
17 Jan 2017 version :c,h4
Version info: :h4

View File

@ -55,12 +55,13 @@ LAMMPS errors are detected at setup time; others like a bond
stretching too far may not occur until the middle of a run.
LAMMPS tries to flag errors and print informative error messages so
you can fix the problem. Of course, LAMMPS cannot figure out your
physics or numerical mistakes, like choosing too big a timestep,
specifying erroneous force field coefficients, or putting 2 atoms on
top of each other! If you run into errors that LAMMPS doesn't catch
that you think it should flag, please send an email to the
"developers"_http://lammps.sandia.gov/authors.html.
you can fix the problem. For most errors it will also print the last
input script command that it was processing. Of course, LAMMPS cannot
figure out your physics or numerical mistakes, like choosing too big a
timestep, specifying erroneous force field coefficients, or putting 2
atoms on top of each other! If you run into errors that LAMMPS
doesn't catch that you think it should flag, please send an email to
the "developers"_http://lammps.sandia.gov/authors.html.
If you get an error message about an invalid command in your input
script, you can determine what command is causing the problem by

View File

@ -1153,7 +1153,7 @@ Package, Description, Author(s), Doc page, Example, Pic/movie, Library
"USER-MISC"_#USER-MISC, single-file contributions, USER-MISC/README, USER-MISC/README, -, -, -
"USER-MANIFOLD"_#USER-MANIFOLD, motion on 2d surface, Stefan Paquay (Eindhoven U of Technology), "fix manifoldforce"_fix_manifoldforce.html, USER/manifold, "manifold"_manifold, -
"USER-MOLFILE"_#USER-MOLFILE, "VMD"_VMD molfile plug-ins, Axel Kohlmeyer (Temple U), "dump molfile"_dump_molfile.html, -, -, VMD-MOLFILE
"USER-NC-DUMP"_#USER-NC-DUMP, dump output via NetCDF, Lars Pastewka (Karlsruhe Institute of Technology, KIT), "dump nc, dump nc/mpiio"_dump_nc.html, -, -, lib/netcdf
"USER-NC-DUMP"_#USER-NC-DUMP, dump output via NetCDF, Lars Pastewka (Karlsruhe Institute of Technology, KIT), "dump nc / dump nc/mpiio"_dump_nc.html, -, -, lib/netcdf
"USER-OMP"_#USER-OMP, OpenMP threaded styles, Axel Kohlmeyer (Temple U), "Section 5.3.4"_accelerate_omp.html, -, -, -
"USER-PHONON"_#USER-PHONON, phonon dynamical matrix, Ling-Ti Kong (Shanghai Jiao Tong U), "fix phonon"_fix_phonon.html, USER/phonon, -, -
"USER-QMMM"_#USER-QMMM, QM/MM coupling, Axel Kohlmeyer (Temple U), "fix qmmm"_fix_qmmm.html, USER/qmmm, -, lib/qmmm
@ -1610,11 +1610,12 @@ and a "dump nc/mpiio"_dump_nc.html command to output LAMMPS snapshots
in this format. See src/USER-NC-DUMP/README for more details.
NetCDF files can be directly visualized with the following tools:
Ovito (http://www.ovito.org/). Ovito supports the AMBER convention
and all of the above extensions. :ulb,l
and all of the above extensions. :ulb,l
VMD (http://www.ks.uiuc.edu/Research/vmd/) :l
AtomEye (http://www.libatoms.org/). The libAtoms version of AtomEye contains
a NetCDF reader that is not present in the standard distribution of AtomEye :l,ule
a NetCDF reader that is not present in the standard distribution of AtomEye :l,ule
The person who created these files is Lars Pastewka at
Karlsruhe Institute of Technology (lars.pastewka at kit.edu).

View File

@ -1727,7 +1727,7 @@ thermodynamic state and a total run time for the simulation. It then
appends statistics about the CPU time and storage requirements for the
simulation. An example set of statistics is shown here:
Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms
Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms :pre
Performance: 18.436 ns/day 1.302 hours/ns 106.689 timesteps/s
97.0% CPU use with 4 MPI tasks x no OpenMP threads :pre
@ -1757,14 +1757,14 @@ Ave special neighs/atom = 2.34032
Neighbor list builds = 26
Dangerous builds = 0 :pre
The first section provides a global loop timing summary. The loop time
The first section provides a global loop timing summary. The {loop time}
is the total wall time for the section. The {Performance} line is
provided for convenience to help predicting the number of loop
continuations required and for comparing performance with other
similar MD codes. The CPU use line provides the CPU utilzation per
continuations required and for comparing performance with other,
similar MD codes. The {CPU use} line provides the CPU utilization per
MPI task; it should be close to 100% times the number of OpenMP
threads (or 1). Lower numbers correspond to delays due to file I/O or
insufficient thread utilization.
threads (or 1 if no OpenMP). Lower numbers correspond to delays due
to file I/O or insufficient thread utilization.
The MPI task section gives the breakdown of the CPU run time (in
seconds) into major categories:
@ -1791,7 +1791,7 @@ is present that also prints the CPU utilization in percent. In
addition, when using {timer full} and the "package omp"_package.html
command are active, a similar timing summary of time spent in threaded
regions to monitor thread utilization and load balance is provided. A
new entry is the {Reduce} section, which lists the time spend in
new entry is the {Reduce} section, which lists the time spent in
reducing the per-thread data elements to the storage for non-threaded
computation. These thread timings are taken from the first MPI rank
only and thus, as the breakdown for MPI tasks can change from MPI

View File

@ -110,14 +110,14 @@ mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj # ditto on 8 Phis :p
[Required hardware/software:]
Kokkos support within LAMMPS must be built with a C++11 compatible
compiler. If using gcc, version 4.8.1 or later is required.
compiler. If using gcc, version 4.7.2 or later is required.
To build with Kokkos support for CPUs, your compiler must support the
OpenMP interface. You should have one or more multi-core CPUs so that
multiple threads can be launched by each MPI task running on a CPU.
To build with Kokkos support for NVIDIA GPUs, NVIDIA Cuda software
version 6.5 or later must be installed on your system. See the
version 7.5 or later must be installed on your system. See the
discussion for the "GPU"_accelerate_gpu.html package for details of
how to check and do this.

View File

@ -91,6 +91,7 @@ Commands :h1
suffix
tad
temper
temper_grem
thermo
thermo_modify
thermo_style

View File

@ -10,22 +10,34 @@ compute coord/atom command :h3
[Syntax:]
compute ID group-ID coord/atom cutoff type1 type2 ... :pre
compute ID group-ID coord/atom cstyle args ... :pre
ID, group-ID are documented in "compute"_compute.html command
coord/atom = style name of this compute command
cutoff = distance within which to count coordination neighbors (distance units)
typeN = atom type for Nth coordination count (see asterisk form below) :ul
one cstyle must be appended :ul
cstyle = {cutoff} or {orientorder}
{cutoff} args = cutoff typeN
cutoff = distance within which to count coordination neighbors (distance units)
typeN = atom type for Nth coordination count (see asterisk form below) :pre
{orientorder} args = orientorderID threshold
orientorderID = ID of a previously defined orientorder/atom compute
threshold = minimum value of the scalar product between two 'connected' atoms (see text for explanation) :pre
[Examples:]
compute 1 all coord/atom 2.0
compute 1 all coord/atom 6.0 1 2
compute 1 all coord/atom 6.0 2*4 5*8 * :pre
compute 1 all coord/atom cutoff 2.0
compute 1 all coord/atom cutoff 6.0 1 2
compute 1 all coord/atom cutoff 6.0 2*4 5*8 *
compute 1 all coord/atom orientorder 2 0.5 :pre
[Description:]
Define a computation that calculates one or more coordination numbers
This compute performs generic calculations between neighboring atoms. So far,
there are two cstyles implemented: {cutoff} and {orientorder}.
The {cutoff} cstyle calculates one or more coordination numbers
for each atom in a group.
A coordination number is defined as the number of neighbor atoms with
@ -49,6 +61,14 @@ from 1 to N. A leading asterisk means all types from 1 to n
(inclusive). A middle asterisk means all types from m to n
(inclusive).
The {orientorder} cstyle calculates the number of 'connected' atoms j
around each atom i. The atom j is connected to i if the scalar product
({Ybar_lm(i)},{Ybar_lm(j)}) is larger than {threshold}. Thus, this cstyle
will work only if a "compute orientorder/atom"_compute_orientorder_atom.html
has been previously defined. This cstyle allows one to apply the
ten Wolde's criterion to identify crystal-like atoms in a system
(see "ten Wolde et al."_#tenWolde).
The value of all coordination numbers will be 0.0 for atoms not in the
specified compute group.
@ -83,10 +103,19 @@ options.
The per-atom vector or array values will be a number >= 0.0, as
explained above.
[Restrictions:] none
[Restrictions:]
The cstyle {orientorder} can only be used if a
"compute orientorder/atom"_compute_orientorder_atom.html command
was previously defined. Otherwise, an error message will be issued.
[Related commands:]
"compute cluster/atom"_compute_cluster_atom.html
"compute orientorder/atom"_compute_orientorder_atom.html
[Default:] none
:line
:link(tenWolde)
[(tenWolde)] P. R. ten Wolde, M. J. Ruiz-Montero, D. Frenkel, J. Chem. Phys. 104, 9932 (1996).

View File

@ -15,17 +15,19 @@ compute ID group-ID orientorder/atom keyword values ... :pre
ID, group-ID are documented in "compute"_compute.html command :ulb,l
orientorder/atom = style name of this compute command :l
one or more keyword/value pairs may be appended :l
keyword = {cutoff} or {nnn} or {degrees}
keyword = {cutoff} or {nnn} or {degrees} or {components}
{cutoff} value = distance cutoff
{nnn} value = number of nearest neighbors
{degrees} values = nlvalues, l1, l2,... :pre
{degrees} values = nlvalues, l1, l2,...
{components} value = l :pre
:ule
[Examples:]
compute 1 all orientorder/atom
compute 1 all orientorder/atom degrees 5 4 6 8 10 12 nnn NULL cutoff 1.5 :pre
compute 1 all orientorder/atom degrees 5 4 6 8 10 12 nnn NULL cutoff 1.5
compute 1 all orientorder/atom degrees 4 6 components 6 nnn NULL cutoff 3.0 :pre
[Description:]
@ -71,6 +73,13 @@ The numerical values of all order parameters up to {Q}12
for a range of commonly encountered high-symmetry structures are given
in Table I of "Mickel et al."_#Mickel.
The optional keyword {components} will output the components of
the normalized complex vector {Ybar_lm} of degree {l}, which must be
explicitly included in the keyword {degrees}. This option can be used
in conjunction with "compute coord_atom"_compute_coord_atom.html to
calculate the ten Wolde's criterion to identify crystal-like particles
(see "ten Wolde et al."_#tenWolde96).
The value of {Ql} is set to zero for atoms not in the
specified compute group, as well as for atoms that have less than
{nnn} neighbors within the distance cutoff.
@ -98,6 +107,12 @@ the neighbor list.
This compute calculates a per-atom array with {nlvalues} columns, giving the
{Ql} values for each atom, which are real numbers on the range 0 <= {Ql} <= 1.
If the keyword {components} is set, then the real and imaginary parts of each
component of (normalized) {Ybar_lm} will be added to the output array in the
following order:
Re({Ybar_-m}) Im({Ybar_-m}) Re({Ybar_-m+1}) Im({Ybar_-m+1}) ... Re({Ybar_m}) Im({Ybar_m}).
This way, the per-atom array will have a total of {nlvalues}+2*(2{l}+1) columns.
These values can be accessed by any command that uses
per-atom values from a compute as input. See "Section
6.15"_Section_howto.html#howto_15 for an overview of LAMMPS output
@ -117,5 +132,9 @@ The option defaults are {cutoff} = pair style cutoff, {nnn} = 12, {degrees} = 5
:link(Steinhardt)
[(Steinhardt)] P. Steinhardt, D. Nelson, and M. Ronchetti, Phys. Rev. B 28, 784 (1983).
:link(Mickel)
[(Mickel)] W. Mickel, S. C. Kapfer, G. E. Schroeder-Turkand, K. Mecke, J. Chem. Phys. 138, 044501 (2013).
:link(tenWolde96)
[(tenWolde)] P. R. ten Wolde, M. J. Ruiz-Montero, D. Frenkel, J. Chem. Phys. 104, 9932 (1996).

View File

@ -35,6 +35,7 @@ Computes :h1
compute_erotate_sphere_atom
compute_event_displace
compute_fep
compute_global_atom
compute_group_group
compute_gyration
compute_gyration_chunk

View File

@ -151,7 +151,7 @@ The option default for the {energy} keyword is energy = no.
:line
:link(Strong)
[(Strong)] Strong and Eaves, J. Phys. Chem. Lett. 7, 1907 (2016).
[(Strong)] Strong and Eaves, J. Phys. Chem. B 121, 189 (2017).
:link(Evans)
[(Evans)] Evans and Morriss, Phys. Rev. Lett. 56, 2172 (1986).

View File

@ -29,7 +29,7 @@ fix fxgREM all grem 502 -0.15 -80000 fxnvt :pre
[Description:]
This fix implements the molecular dynamics version of the generalized
replica exchange method (gREM) originally developed by "(Kim)"_#Kim,
replica exchange method (gREM) originally developed by "(Kim)"_#Kim2010,
which uses non-Boltzmann ensembles to sample over first order phase
transitions. This is done by defining replicas with an enthalpy
dependent effective temperature
@ -103,7 +103,7 @@ npt"_fix_nh.html, "thermo_modify"_thermo_modify.html
:line
:link(Kim)
:link(Kim2010)
[(Kim)] Kim, Keyes, Straub, J Chem. Phys, 132, 224107 (2010).
:link(Malolepsza)

View File

@ -89,11 +89,7 @@ NOTE: The center of mass of a group of atoms is calculated in
group can straddle a periodic boundary. See the "dump"_dump.html doc
page for a discussion of unwrapped coordinates. It also means that a
spring connecting two groups or a group and the tether point can cross
a periodic boundary and its length be calculated correctly. One
exception is for rigid bodies, which should not be used with the fix
spring command, if the rigid body will cross a periodic boundary.
This is because image flags for rigid bodies are used in a different
way, as explained on the "fix rigid"_fix_rigid.html doc page.
a periodic boundary and its length be calculated correctly.
[Restart, fix_modify, output, run start/stop, minimize info:]

View File

@ -23,6 +23,7 @@ Section_history.html
tutorial_drude.html
tutorial_github.html
tutorial_pylammps.html
body.html
manifolds.html
@ -113,6 +114,7 @@ special_bonds.html
suffix.html
tad.html
temper.html
temper_grem.html
thermo.html
thermo_modify.html
thermo_style.html

View File

@ -32,7 +32,7 @@ Run a parallel tempering or replica exchange simulation in LAMMPS
partition mode using multiple generalized replicas (ensembles) of a
system defined by "fix grem"_fix_grem.html, which stands for the
generalized replica exchange method (gREM) originally developed by
"(Kim)"_#Kim. It uses non-Boltzmann ensembles to sample over first
"(Kim)"_#KimStraub. It uses non-Boltzmann ensembles to sample over first
order phase transitions. This is done by defining replicas with an
enthalpy dependent effective temperature
@ -105,5 +105,5 @@ This command must be used with "fix grem"_fix_grem.html.
[Default:] none
:link(Kim)
:link(KimStraub)
[(Kim)] Kim, Keyes, Straub, J Chem Phys, 132, 224107 (2010).

View File

@ -33,14 +33,14 @@ timer loop :pre
Select the level of detail at which LAMMPS performs its CPU timings.
Multiple keywords can be specified with the {timer} command. For
keywords that are mutually exclusive, the last one specified takes
effect.
precedence.
During a simulation run LAMMPS collects information about how much
time is spent in different sections of the code and thus can provide
information for determining performance and load imbalance problems.
This can be done at different levels of detail and accuracy. For more
information about the timing output, see this "discussion of screen
output"_Section_start.html#start_8.
output in Section 2.8"_Section_start.html#start_8.
The {off} setting will turn all time measurements off. The {loop}
setting will only measure the total time for a run and not collect any
@ -52,20 +52,22 @@ processors. The {full} setting adds information about CPU
utilization and thread utilization, when multi-threading is enabled.
With the {sync} setting, all MPI tasks are synchronized at each timer
call which meaures load imbalance more accuractly, though it can also
slow down the simulation. Using the {nosync} setting (which is the
default) turns off this synchronization.
call which measures load imbalance for each section more accurately,
though it can also slow down the simulation by prohibiting overlapping
independent computations on different MPI ranks. Using the {nosync}
setting (which is the default) turns this synchronization off.
With the {timeout} keyword a walltime limit can be imposed that
With the {timeout} keyword a walltime limit can be imposed, that
affects the "run"_run.html and "minimize"_minimize.html commands.
This can be convenient when runs have to confirm to time limits,
e.g. when running under a batch system and you want to maximize
the utilization of the batch time slot, especially when the time
per timestep varies and is thus difficult to predict how many
steps a simulation can perform, or for difficult to converge
minimizations. The timeout {elapse} value should be somewhat smaller
than the time requested from the batch system, as there is usually
some overhead to launch jobs, and it may be advisable to write
This can be convenient when calculations have to comply with execution
time limits, e.g. when running under a batch system when you want to
maximize the utilization of the batch time slot, especially for runs
where the time per timestep varies much and thus it becomes difficult
to predict how many steps a simulation can perform for a given walltime
limit. This also applies for difficult to converge minimizations.
The timeout {elapse} value should be somewhat smaller than the maximum
wall time requested from the batch system, as there is usually
some overhead to launch jobs, and it is advisable to write
out a restart after terminating a run due to a timeout.
The timeout timer starts when the command is issued. When the time

View File

@ -336,12 +336,15 @@ commit and push again:
$ git commit -m "Merged Axel's suggestions and updated text"
$ git push git@github.com:Pakketeretet2/lammps :pre
This merge also shows up on the lammps Github page:
:c,image(JPG/tutorial_reverse_pull_request7.png)
:line
[After a merge]
When everything is fine, the feature branch is merged into the master branch.
When everything is fine, the feature branch is merged into the master branch:
:c,image(JPG/tutorial_merged.png)

View File

@ -18,7 +18,7 @@ neigh_modify every 1 delay 0 check no once no
timestep 0.001
compute dpdU all dpd
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[1]+press*vol
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[2]+press*vol
thermo 1
thermo_style custom step temp press vol pe ke v_totEnergy cella cellb cellc

View File

@ -22,7 +22,7 @@ neigh_modify every 1 delay 0 check no once no
timestep 0.001
compute dpdU all dpd
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[1]+press*vol
variable totEnergy equal pe+ke+c_dpdU[1]+c_dpdU[2]+press*vol
thermo 1
thermo_style custom step temp press vol pe ke v_totEnergy cella cellb cellc
@ -34,129 +34,137 @@ fix 2 all eos/cv 0.0005
run 100
Neighbor list info ...
1 neighbor list requests
update every 1 steps, delay 0 steps, check no
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 12
ghost atom cutoff = 12
binsize = 6 -> bins = 22 22 22
Memory usage per processor = 6.48143 Mbytes
binsize = 6, bins = 22 22 22
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair dpd/fdt/energy, perpetual
pair build: half/bin/newton
stencil: half/bin/3d/newton
bin: standard
(2) fix shardlow, perpetual, ssa
pair build: half/bin/newton/ssa
stencil: half/bin/3d/newton/ssa
bin: ssa
Memory usage per processor = 8.55503 Mbytes
Step Temp Press Volume PotEng KinEng v_totEnergy Cella Cellb Cellc
0 239.4274282976 2817.4421750949 2146689.0000000000 2639.8225470740 313.3218455755 6048176597.3066043854 129.0000000000 129.0000000000 129.0000000000
1 239.4771405316 2817.4798146419 2146689.0000581890 2639.8304543632 313.3869004818 6048257397.9450111389 129.0000000012 129.0000000012 129.0000000012
2 239.5643955010 2817.5423194969 2146689.0002327557 2639.8379071907 313.5010849268 6048391577.0431985855 129.0000000047 129.0000000047 129.0000000047
3 239.6633839196 2817.6123662396 2146689.0005237064 2639.8445238058 313.6306241122 6048541946.5712032318 129.0000000105 129.0000000105 129.0000000105
4 239.5371222027 2817.5355424336 2146689.0009310376 2639.8505035043 313.4653942786 6048377030.7404460907 129.0000000186 129.0000000186 129.0000000186
5 239.6512678169 2817.6153097076 2146689.0014547524 2639.8561498340 313.6147686202 6048548267.9007377625 129.0000000291 129.0000000291 129.0000000291
6 239.5617886781 2817.5624195435 2146689.0020948485 2639.8617493725 313.4976735610 6048434730.8592004776 129.0000000420 129.0000000420 129.0000000420
7 239.5228587856 2817.5420009502 2146689.0028513218 2639.8666590407 313.4467287471 6048390900.5748577118 129.0000000571 129.0000000571 129.0000000571
8 239.6066877934 2817.6008649264 2146689.0037241788 2639.8710757645 313.5564298772 6048517265.7987136841 129.0000000746 129.0000000746 129.0000000746
9 239.5719861485 2817.5823530300 2146689.0047134170 2639.8752557893 313.5110182737 6048477529.2603597641 129.0000000944 129.0000000944 129.0000000944
10 239.5800176776 2817.5915671176 2146689.0058190385 2639.8793778438 313.5215285712 6048497312.1706552505 129.0000001166 129.0000001166 129.0000001166
11 239.6299830954 2817.6281223139 2146689.0070410441 2639.8829762049 313.5869148014 6048575788.3208351135 129.0000001410 129.0000001410 129.0000001410
12 239.6011995911 2817.6132377273 2146689.0083794324 2639.8860704236 313.5492478526 6048543839.4788360596 129.0000001678 129.0000001678 129.0000001678
13 239.6407681166 2817.6427924824 2146689.0098342048 2639.8889816934 313.6010284005 6048607288.5005025864 129.0000001970 129.0000001970 129.0000001970
14 239.6981172055 2817.6844100046 2146689.0114053637 2639.8913405110 313.6760771219 6048696632.8825626373 129.0000002285 129.0000002285 129.0000002285
15 239.8563971968 2817.7922519039 2146689.0130929090 2639.8934358481 313.8832070208 6048928140.8671455383 129.0000002623 129.0000002623 129.0000002623
16 239.8561894618 2817.7971208197 2146689.0148968464 2639.8950496967 313.8829351726 6048938597.9994916916 129.0000002984 129.0000002984 129.0000002984
17 239.8816520361 2817.8185621543 2146689.0168171758 2639.8961257823 313.9162562538 6048984631.3226108551 129.0000003369 129.0000003369 129.0000003369
18 239.9099966096 2817.8417368960 2146689.0188538977 2639.8965743204 313.9533488047 6049034386.0627622604 129.0000003777 129.0000003777 129.0000003777
19 240.0514024347 2817.9389205774 2146689.0210070144 2639.8966103811 314.1383966683 6049243015.4568052292 129.0000004208 129.0000004208 129.0000004208
20 239.8802541140 2817.8327386176 2146689.0232765260 2639.8962085210 313.9144268914 6049015081.9802341461 129.0000004662 129.0000004662 129.0000004662
21 239.8462621903 2817.8160306167 2146689.0256624296 2639.8953174755 313.8699440502 6048979221.7758703232 129.0000005140 129.0000005140 129.0000005140
22 240.0487944678 2817.9533849157 2146689.0281647225 2639.8938590354 314.1349838054 6049274086.0571212769 129.0000005642 129.0000005642 129.0000005642
23 240.0966314441 2817.9897873787 2146689.0307834130 2639.8918104774 314.1975846937 6049352238.2649183273 129.0000006166 129.0000006166 129.0000006166
24 240.1765312516 2818.0463843765 2146689.0335185044 2639.8891292321 314.3021439554 6049473742.2287187576 129.0000006714 129.0000006714 129.0000006714
25 240.1500705973 2818.0336048048 2146689.0363699966 2639.8858785483 314.2675167572 6049446316.4600162506 129.0000007285 129.0000007285 129.0000007285
26 240.2681423500 2818.1151708195 2146689.0393378921 2639.8825176506 314.4220289603 6049621421.8445177078 129.0000007880 129.0000007880 129.0000007880
27 240.4728815247 2818.2527327079 2146689.0424221945 2639.8784158747 314.6899567267 6049916733.3989181519 129.0000008498 129.0000008498 129.0000008498
28 240.4793027032 2818.2613348477 2146689.0456229053 2639.8736089473 314.6983596717 6049935208.5421981812 129.0000009139 129.0000009139 129.0000009139
29 240.5020619198 2818.2805472685 2146689.0489400285 2639.8681043704 314.7281430587 6049976461.0082206726 129.0000009803 129.0000009803 129.0000009803
30 240.5513721776 2818.3167157263 2146689.0523735629 2639.8623484053 314.7926719270 6050054113.1760177612 129.0000010491 129.0000010491 129.0000010491
31 240.7340393104 2818.4391703712 2146689.0559235099 2639.8563442170 315.0317155636 6050316995.4599781036 129.0000011202 129.0000011202 129.0000011202
32 240.8254719483 2818.5014640740 2146689.0595898777 2639.8498122053 315.1513670299 6050450731.1168394089 129.0000011936 129.0000011936 129.0000011936
33 240.9681573541 2818.5965480750 2146689.0633726656 2639.8425779528 315.3380893908 6050654857.7432861328 129.0000012694 129.0000012694 129.0000012694
34 241.0039494187 2818.6217008564 2146689.0672718794 2639.8347174393 315.3849279499 6050708863.9733209610 129.0000013475 129.0000013475 129.0000013475
35 241.0314566197 2818.6411150538 2146689.0712875174 2639.8262983643 315.4209246902 6050750551.5649127960 129.0000014279 129.0000014279 129.0000014279
36 241.0829173424 2818.6763455617 2146689.0754195810 2639.8174397481 315.4882677207 6050826192.2165899277 129.0000015107 129.0000015107 129.0000015107
37 241.2845682012 2818.8087982181 2146689.0796680767 2639.8080129872 315.7521540252 6051110539.1171846390 129.0000015958 129.0000015958 129.0000015958
38 241.3214712920 2818.8336260248 2146689.0840330068 2639.7981963574 315.8004465062 6051163849.0412235260 129.0000016833 129.0000016833 129.0000016833
39 241.3392127125 2818.8456991528 2146689.0885143690 2639.7879618658 315.8236634561 6051189778.9386901855 129.0000017730 129.0000017730 129.0000017730
40 241.5383770555 2818.9753950055 2146689.0931121684 2639.7769824244 316.0842958321 6051468208.8210506439 129.0000018651 129.0000018651 129.0000018651
41 241.5059730674 2818.9543817992 2146689.0978264087 2639.7656512498 316.0418910106 6051423113.2358427048 129.0000019595 129.0000019595 129.0000019595
42 241.3907605672 2818.8793800508 2146689.1026570834 2639.7541331920 315.8911205101 6051262121.2551422119 129.0000020563 129.0000020563 129.0000020563
43 241.5095917610 2818.9559595711 2146689.1076041958 2639.7424355740 316.0466265406 6051426527.7663059235 129.0000021554 129.0000021554 129.0000021554
44 241.6271631762 2819.0312325531 2146689.1126677482 2639.7297705654 316.2004839873 6051588129.8722610474 129.0000022568 129.0000022568 129.0000022568
45 241.5702411838 2818.9923790176 2146689.1178477411 2639.7163554760 316.1259941770 6051504737.9250564575 129.0000023606 129.0000023606 129.0000023606
46 241.7029985068 2819.0771124986 2146689.1231441777 2639.7024246704 316.2997243538 6051686649.4576120377 129.0000024667 129.0000024667 129.0000024667
47 241.7966144965 2819.1357830868 2146689.1285570571 2639.6882106593 316.4222330191 6051812612.3391046524 129.0000025751 129.0000025751 129.0000025751
48 241.8573480255 2819.1726205120 2146689.1340863821 2639.6735287925 316.5017107195 6051891706.4921989441 129.0000026859 129.0000026859 129.0000026859
49 241.9611147338 2819.2374095379 2146689.1397321564 2639.6583357477 316.6375029166 6052030804.4275226593 129.0000027990 129.0000027990 129.0000027990
50 242.1023518806 2819.3259059811 2146689.1454943856 2639.6424863169 316.8223300428 6052220795.1955394745 129.0000029144 129.0000029144 129.0000029144
51 242.1174105473 2819.3319633044 2146689.1513730693 2639.6264141131 316.8420362613 6052233814.9634265900 129.0000030321 129.0000030321 129.0000030321
52 242.2534914901 2819.4164594322 2146689.1573682069 2639.6098392670 317.0201158259 6052415218.9485445023 129.0000031522 129.0000031522 129.0000031522
53 242.3504633236 2819.4754119996 2146689.1634798055 2639.5930076506 317.1470160479 6052541789.1274013519 129.0000032746 129.0000032746 129.0000032746
54 242.2982323323 2819.4368568264 2146689.1697078613 2639.5756353782 317.0786650211 6052459040.6286897659 129.0000033994 129.0000033994 129.0000033994
55 242.3452896272 2819.4623310219 2146689.1760523771 2639.5575918586 317.1402455951 6052513743.7400159836 129.0000035265 129.0000035265 129.0000035265
56 242.4181903333 2819.5048897011 2146689.1825133534 2639.5390347547 317.2356456249 6052605122.2894439697 129.0000036559 129.0000036559 129.0000036559
57 242.5317091656 2819.5739975787 2146689.1890907930 2639.5199828249 317.3841997413 6052753494.0979280472 129.0000037876 129.0000037876 129.0000037876
58 242.5478978740 2819.5796954935 2146689.1957846982 2639.5006137388 317.4053847660 6052765744.6257629395 129.0000039217 129.0000039217 129.0000039217
59 242.6655316466 2819.6519225743 2146689.2025950695 2639.4808234811 317.5593238156 6052920813.0568208694 129.0000040582 129.0000040582 129.0000040582
60 242.8126131177 2819.7431588157 2146689.2095219092 2639.4607996998 317.7517989980 6053116688.6155729294 129.0000041969 129.0000041969 129.0000041969
61 242.7957124913 2819.7275989047 2146689.2165652174 2639.4406312730 317.7296823362 6053083306.1403274536 129.0000043380 129.0000043380 129.0000043380
62 242.9276177041 2819.8088790098 2146689.2237249981 2639.4201279058 317.9022974164 6053257809.6067762375 129.0000044814 129.0000044814 129.0000044814
63 243.0465445938 2819.8814758895 2146689.2310012528 2639.3991657500 318.0579286774 6053413673.1989650726 129.0000046272 129.0000046272 129.0000046272
64 242.9890585501 2819.8387587817 2146689.2383939880 2639.3781767844 317.9827007328 6053321993.5937871933 129.0000047752 129.0000047752 129.0000047752
65 242.9653746583 2819.8180104181 2146689.2459031967 2639.3568184374 317.9517072884 6053277474.4272727966 129.0000049256 129.0000049256 129.0000049256
66 243.0259297024 2819.8514334947 2146689.2535288804 2639.3352568621 318.0309514181 6053349244.9473772049 129.0000050784 129.0000050784 129.0000050784
67 242.9638979697 2819.8046112742 2146689.2612710390 2639.3134547096 317.9497748498 6053248753.9180717468 129.0000052335 129.0000052335 129.0000052335
68 243.0283540775 2819.8395632725 2146689.2691296688 2639.2912303374 318.0341240273 6053323807.2197017670 129.0000053909 129.0000053909 129.0000053909
69 243.2256418664 2819.9609646019 2146689.2771047787 2639.2684509205 318.2923006889 6053584440.8757400513 129.0000055506 129.0000055506 129.0000055506
70 243.2507495334 2819.9706145524 2146689.2851963686 2639.2450126010 318.3251573278 6053605179.1483964920 129.0000057127 129.0000057127 129.0000057127
71 243.4287155518 2820.0794853386 2146689.2934044413 2639.2213699915 318.5580489464 6053838914.2552747726 129.0000058771 129.0000058771 129.0000058771
72 243.5097518574 2820.1249498194 2146689.3017290002 2639.1971212009 318.6640954635 6053936535.9274711609 129.0000060439 129.0000060439 129.0000060439
73 243.5356790969 2820.1337977544 2146689.3101700447 2639.1723394661 318.6980246193 6053955553.5090074539 129.0000062130 129.0000062130 129.0000062130
74 243.5479180498 2820.1331964183 2146689.3187275808 2639.1473868749 318.7140408766 6053954286.7515821457 129.0000063844 129.0000063844 129.0000063844
75 243.7115573025 2820.2314361523 2146689.3274016059 2639.1220411207 318.9281840641 6054165201.5909118652 129.0000065581 129.0000065581 129.0000065581
76 243.7457279618 2820.2454531429 2146689.3361921217 2639.0963868224 318.9729008040 6054195316.5254154205 129.0000067342 129.0000067342 129.0000067342
77 243.8345031069 2820.2948644965 2146689.3450991292 2639.0700900389 319.0890745962 6054301412.5615310669 129.0000069126 129.0000069126 129.0000069126
78 244.0193931195 2820.4067881628 2146689.3541226317 2639.0435094409 319.3310271594 6054541703.5689058304 129.0000070934 129.0000070934 129.0000070934
79 243.9919100078 2820.3799166166 2146689.3632626338 2639.0164249037 319.2950619430 6054484044.4218587875 129.0000072765 129.0000072765 129.0000072765
80 244.0965612207 2820.4387335935 2146689.3725191355 2638.9888176882 319.4320116291 6054610332.4174261093 129.0000074619 129.0000074619 129.0000074619
81 244.1334315951 2820.4535208568 2146689.3818921377 2638.9608330195 319.4802612965 6054642102.5347270966 129.0000076496 129.0000076496 129.0000076496
82 244.3029520408 2820.5543485196 2146689.3913816395 2638.9318525796 319.7021007878 6054858575.1664342880 129.0000078397 129.0000078397 129.0000078397
83 244.3445761189 2820.5713690935 2146689.4009876498 2638.9021684795 319.7565712929 6054895140.1710596085 129.0000080321 129.0000080321 129.0000080321
84 244.2696671559 2820.5125763350 2146689.4107101629 2638.8720941742 319.6585431986 6054768957.6739044189 129.0000082269 129.0000082269 129.0000082269
85 244.5161919319 2820.6629431352 2146689.4205491822 2638.8415194387 319.9811528443 6055091776.5361995697 129.0000084240 129.0000084240 129.0000084240
86 244.5641090282 2820.6838080201 2146689.4305047127 2638.8103612394 320.0438585800 6055136595.0767974854 129.0000086234 129.0000086234 129.0000086234
87 244.5348240638 2820.6541129118 2146689.4405767513 2638.7789728309 320.0055354056 6055072877.2416200638 129.0000088251 129.0000088251 129.0000088251
88 244.6939431427 2820.7468233396 2146689.4507653015 2638.7470269267 320.2137633592 6055271926.6536149979 129.0000090292 129.0000090292 129.0000090292
89 244.8800201091 2820.8567117003 2146689.4610703662 2638.7147520097 320.4572692055 6055507852.1186332703 129.0000092356 129.0000092356 129.0000092356
90 244.8804280382 2820.8451141876 2146689.4714919478 2638.6820441173 320.4578030336 6055482985.2258749008 129.0000094444 129.0000094444 129.0000094444
91 244.9558851986 2820.8815975090 2146689.4820300462 2638.6491836104 320.5565485155 6055561333.3803453445 129.0000096555 129.0000096555 129.0000096555
92 244.9965893140 2820.8949614294 2146689.4926846647 2638.6159817170 320.6098151301 6055590051.6433181763 129.0000098689 129.0000098689 129.0000098689
93 245.1381056687 2820.9732811388 2146689.5034558061 2638.5824451870 320.7950076360 6055758210.2774200439 129.0000100846 129.0000100846 129.0000100846
94 245.2954807041 2821.0619342131 2146689.5143434699 2638.5485198222 321.0009532826 6055948551.7882709503 129.0000103027 129.0000103027 129.0000103027
95 245.3535822199 2821.0860553731 2146689.5253476589 2638.5144817512 321.0769866522 6056000363.5151576996 129.0000105232 129.0000105232 129.0000105232
96 245.5013476026 2821.1682908185 2146689.5364683764 2638.4801107361 321.2703568219 6056176929.0169925690 129.0000107459 129.0000107459 129.0000107459
97 245.4166531417 2821.0989038023 2146689.5477056229 2638.4453663061 321.1595231342 6056028008.1910057068 129.0000109710 129.0000109710 129.0000109710
98 245.4121937790 2821.0817490953 2146689.5590593945 2638.4097762390 321.1536874797 6055991214.3494396210 129.0000111984 129.0000111984 129.0000111984
99 245.4532592994 2821.0946353191 2146689.5705296928 2638.3738037546 321.2074270397 6056018909.4480972290 129.0000114282 129.0000114282 129.0000114282
100 245.7500657390 2821.2735939427 2146689.5821165247 2638.3375549051 321.5958367642 6056403111.1006488800 129.0000116603 129.0000116603 129.0000116603
Loop time of 4.05006 on 1 procs for 100 steps with 10125 atoms
0 239.4274282976 2817.4421750949 2146689.0000000000 2639.8225470740 313.3218455755 6048176597.3066034317 129.0000000000 129.0000000000 129.0000000000
1 239.4771405316 2817.4798146419 2146689.0000581890 2639.8304543632 313.3869004818 6048257397.8720483780 129.0000000012 129.0000000012 129.0000000012
2 239.5643955010 2817.5423194969 2146689.0002327557 2639.8379071907 313.5010849268 6048391576.8485937119 129.0000000047 129.0000000047 129.0000000047
3 239.6633839196 2817.6123662396 2146689.0005237064 2639.8445238058 313.6306241122 6048541946.2404479980 129.0000000105 129.0000000105 129.0000000105
4 239.5371222027 2817.5355424336 2146689.0009310376 2639.8505035043 313.4653942786 6048377030.5689325333 129.0000000186 129.0000000186 129.0000000186
5 239.6512678169 2817.6153097076 2146689.0014547524 2639.8561498340 313.6147686202 6048548267.5742130280 129.0000000291 129.0000000291 129.0000000291
6 239.5617886781 2817.5624195435 2146689.0020948485 2639.8617493725 313.4976735610 6048434730.6441593170 129.0000000420 129.0000000420 129.0000000420
7 239.5228587856 2817.5420009502 2146689.0028513218 2639.8666590407 313.4467287471 6048390900.4058599472 129.0000000571 129.0000000571 129.0000000571
8 239.6066877934 2817.6008649264 2146689.0037241788 2639.8710757645 313.5564298772 6048517265.5155982971 129.0000000746 129.0000000746 129.0000000746
9 239.5719861485 2817.5823530300 2146689.0047134170 2639.8752557893 313.5110182737 6048477529.0184717178 129.0000000944 129.0000000944 129.0000000944
10 239.5800176776 2817.5915671176 2146689.0058190385 2639.8793778438 313.5215285712 6048497311.9141387939 129.0000001166 129.0000001166 129.0000001166
11 239.6299830954 2817.6281223139 2146689.0070410441 2639.8829762049 313.5869148014 6048575787.9953098297 129.0000001410 129.0000001410 129.0000001410
12 239.6011995911 2817.6132377273 2146689.0083794324 2639.8860704236 313.5492478526 6048543839.1878814697 129.0000001678 129.0000001678 129.0000001678
13 239.6407681166 2817.6427924824 2146689.0098342048 2639.8889816934 313.6010284005 6048607288.1548709869 129.0000001970 129.0000001970 129.0000001970
14 239.6981172055 2817.6844100046 2146689.0114053637 2639.8913405110 313.6760771219 6048696632.4595127106 129.0000002285 129.0000002285 129.0000002285
15 239.8563971968 2817.7922519039 2146689.0130929090 2639.8934358481 313.8832070208 6048928140.2348766327 129.0000002623 129.0000002623 129.0000002623
16 239.8561894618 2817.7971208196 2146689.0148968464 2639.8950496967 313.8829351726 6048938597.3658657074 129.0000002984 129.0000002984 129.0000002984
17 239.8816520361 2817.8185621543 2146689.0168171758 2639.8961257823 313.9162562538 6048984630.6545839310 129.0000003369 129.0000003369 129.0000003369
18 239.9099966096 2817.8417368960 2146689.0188538977 2639.8965743204 313.9533488047 6049034385.3571958542 129.0000003777 129.0000003777 129.0000003777
19 240.0514024347 2817.9389205774 2146689.0210070144 2639.8966103811 314.1383966683 6049243014.5661621094 129.0000004208 129.0000004208 129.0000004208
20 239.8802541140 2817.8327386176 2146689.0232765260 2639.8962085210 313.9144268914 6049015081.3139505386 129.0000004662 129.0000004662 129.0000004662
21 239.8462621903 2817.8160306167 2146689.0256624296 2639.8953174755 313.8699440502 6048979221.1549577713 129.0000005140 129.0000005140 129.0000005140
22 240.0487944678 2817.9533849157 2146689.0281647225 2639.8938590354 314.1349838054 6049274085.1726217270 129.0000005642 129.0000005642 129.0000005642
23 240.0966314441 2817.9897873787 2146689.0307834130 2639.8918104774 314.1975846937 6049352237.3198652267 129.0000006166 129.0000006166 129.0000006166
24 240.1765312516 2818.0463843765 2146689.0335185044 2639.8891292321 314.3021439554 6049473741.1817827225 129.0000006714 129.0000006714 129.0000006714
25 240.1500705973 2818.0336048048 2146689.0363699966 2639.8858785483 314.2675167572 6049446315.4509468079 129.0000007285 129.0000007285 129.0000007285
26 240.2681423500 2818.1151708195 2146689.0393378921 2639.8825176506 314.4220289603 6049621420.6842966080 129.0000007880 129.0000007880 129.0000007880
27 240.4728815247 2818.2527327079 2146689.0424221945 2639.8784158747 314.6899567267 6049916731.9748563766 129.0000008498 129.0000008498 129.0000008498
28 240.4793027032 2818.2613348477 2146689.0456229053 2639.8736089473 314.6983596717 6049935207.1145420074 129.0000009139 129.0000009139 129.0000009139
29 240.5020619198 2818.2805472685 2146689.0489400285 2639.8681043704 314.7281430587 6049976459.5562763214 129.0000009803 129.0000009803 129.0000009803
30 240.5513721776 2818.3167157263 2146689.0523735629 2639.8623484053 314.7926719270 6050054111.6652946472 129.0000010491 129.0000010491 129.0000010491
31 240.7340393104 2818.4391703712 2146689.0559235099 2639.8563442170 315.0317155636 6050316993.7162160873 129.0000011202 129.0000011202 129.0000011202
32 240.8254719483 2818.5014640740 2146689.0595898777 2639.8498122053 315.1513670299 6050450729.2599506378 129.0000011936 129.0000011936 129.0000011936
33 240.9681573541 2818.5965480750 2146689.0633726656 2639.8425779528 315.3380893908 6050654855.7068986893 129.0000012694 129.0000012694 129.0000012694
34 241.0039494187 2818.6217008564 2146689.0672718794 2639.8347174393 315.3849279499 6050708861.8979463577 129.0000013475 129.0000013475 129.0000013475
35 241.0314566197 2818.6411150538 2146689.0712875174 2639.8262983643 315.4209246902 6050750549.4619541168 129.0000014279 129.0000014279 129.0000014279
36 241.0829173424 2818.6763455617 2146689.0754195810 2639.8174397481 315.4882677207 6050826190.0551443100 129.0000015107 129.0000015107 129.0000015107
37 241.2845682012 2818.8087982181 2146689.0796680767 2639.8080129872 315.7521540252 6051110536.7012710571 129.0000015958 129.0000015958 129.0000015958
38 241.3214712920 2818.8336260248 2146689.0840330068 2639.7981963574 315.8004465062 6051163846.5868301392 129.0000016833 129.0000016833 129.0000016833
39 241.3392127125 2818.8456991528 2146689.0885143690 2639.7879618658 315.8236634561 6051189776.4712991714 129.0000017730 129.0000017730 129.0000017730
40 241.5383770555 2818.9753950055 2146689.0931121684 2639.7769824244 316.0842958321 6051468206.1039972305 129.0000018651 129.0000018651 129.0000018651
41 241.5059730674 2818.9543817992 2146689.0978264087 2639.7656512498 316.0418910106 6051423110.5725250244 129.0000019595 129.0000019595 129.0000019595
42 241.3907605672 2818.8793800508 2146689.1026570834 2639.7541331920 315.8911205101 6051262118.7541017532 129.0000020563 129.0000020563 129.0000020563
43 241.5095917610 2818.9559595711 2146689.1076041958 2639.7424355740 316.0466265406 6051426525.1214485168 129.0000021554 129.0000021554 129.0000021554
44 241.6271631762 2819.0312325531 2146689.1126677482 2639.7297705654 316.2004839873 6051588127.0861988068 129.0000022568 129.0000022568 129.0000022568
45 241.5702411838 2818.9923790176 2146689.1178477411 2639.7163554760 316.1259941770 6051504735.2269029617 129.0000023606 129.0000023606 129.0000023606
46 241.7029985068 2819.0771124986 2146689.1231441777 2639.7024246704 316.2997243538 6051686646.5996389389 129.0000024667 129.0000024667 129.0000024667
47 241.7966144965 2819.1357830868 2146689.1285570571 2639.6882106593 316.4222330191 6051812609.3728218079 129.0000025751 129.0000025751 129.0000025751
48 241.8573480255 2819.1726205120 2146689.1340863821 2639.6735287925 316.5017107195 6051891703.4611186981 129.0000026859 129.0000026859 129.0000026859
49 241.9611147338 2819.2374095379 2146689.1397321564 2639.6583357477 316.6375029166 6052030801.2758235931 129.0000027990 129.0000027990 129.0000027990
50 242.1023518806 2819.3259059811 2146689.1454943856 2639.6424863169 316.8223300428 6052220791.8748512268 129.0000029144 129.0000029144 129.0000029144
51 242.1174105473 2819.3319633044 2146689.1513730693 2639.6264141131 316.8420362613 6052233811.6391019821 129.0000030321 129.0000030321 129.0000030321
52 242.2534914901 2819.4164594322 2146689.1573682069 2639.6098392671 317.0201158259 6052415215.4627037048 129.0000031522 129.0000031522 129.0000031522
53 242.3504633236 2819.4754119996 2146689.1634798055 2639.5930076506 317.1470160479 6052541785.5314817429 129.0000032746 129.0000032746 129.0000032746
54 242.2982323323 2819.4368568264 2146689.1697078613 2639.5756353782 317.0786650211 6052459037.1184797287 129.0000033994 129.0000033994 129.0000033994
55 242.3452896272 2819.4623310219 2146689.1760523771 2639.5575918586 317.1402455951 6052513740.1862611771 129.0000035265 129.0000035265 129.0000035265
56 242.4181903333 2819.5048897011 2146689.1825133534 2639.5390347547 317.2356456249 6052605118.6588287354 129.0000036559 129.0000036559 129.0000036559
57 242.5317091656 2819.5739975787 2146689.1890907930 2639.5199828249 317.3841997413 6052753490.3378009796 129.0000037876 129.0000037876 129.0000037876
58 242.5478978740 2819.5796954935 2146689.1957846982 2639.5006137388 317.4053847660 6052765740.8638200760 129.0000039217 129.0000039217 129.0000039217
59 242.6655316466 2819.6519225743 2146689.2025950695 2639.4808234811 317.5593238156 6052920809.1607065201 129.0000040582 129.0000040582 129.0000040582
60 242.8126131177 2819.7431588157 2146689.2095219092 2639.4607996998 317.7517989980 6053116684.5470046997 129.0000041969 129.0000041969 129.0000041969
61 242.7957124913 2819.7275989047 2146689.2165652174 2639.4406312730 317.7296823362 6053083302.1140241623 129.0000043380 129.0000043380 129.0000043380
62 242.9276177041 2819.8088790098 2146689.2237249981 2639.4201279058 317.9022974164 6053257805.4283437729 129.0000044814 129.0000044814 129.0000044814
63 243.0465445938 2819.8814758895 2146689.2310012528 2639.3991657500 318.0579286774 6053413668.8858547211 129.0000046272 129.0000046272 129.0000046272
64 242.9890585501 2819.8387587817 2146689.2383939880 2639.3781767844 317.9827007328 6053321989.3768787384 129.0000047752 129.0000047752 129.0000047752
65 242.9653746583 2819.8180104181 2146689.2459031967 2639.3568184374 317.9517072884 6053277470.2627182007 129.0000049256 129.0000049256 129.0000049256
66 243.0259297024 2819.8514334947 2146689.2535288804 2639.3352568621 318.0309514181 6053349240.7251205444 129.0000050784 129.0000050784 129.0000050784
67 242.9638979697 2819.8046112742 2146689.2612710390 2639.3134547096 317.9497748498 6053248749.7987766266 129.0000052335 129.0000052335 129.0000052335
68 243.0283540775 2819.8395632725 2146689.2691296688 2639.2912303374 318.0341240273 6053323803.0382738113 129.0000053909 129.0000053909 129.0000053909
69 243.2256418664 2819.9609646019 2146689.2771047787 2639.2684509205 318.2923006889 6053584436.4588871002 129.0000055506 129.0000055506 129.0000055506
70 243.2507495334 2819.9706145524 2146689.2851963686 2639.2450126010 318.3251573278 6053605174.7221174240 129.0000057127 129.0000057127 129.0000057127
71 243.4287155518 2820.0794853386 2146689.2934044413 2639.2213699915 318.5580489464 6053838909.6197280884 129.0000058771 129.0000058771 129.0000058771
72 243.5097518574 2820.1249498194 2146689.3017290002 2639.1971212009 318.6640954635 6053936531.2101163864 129.0000060439 129.0000060439 129.0000060439
73 243.5356790969 2820.1337977544 2146689.3101700447 2639.1723394661 318.6980246193 6053955548.7824945450 129.0000062130 129.0000062130 129.0000062130
74 243.5479180498 2820.1331964183 2146689.3187275808 2639.1473868749 318.7140408766 6053954282.0339813232 129.0000063844 129.0000063844 129.0000063844
75 243.7115573025 2820.2314361523 2146689.3274016059 2639.1220411207 318.9281840641 6054165196.6845111847 129.0000065581 129.0000065581 129.0000065581
76 243.7457279618 2820.2454531429 2146689.3361921217 2639.0963868224 318.9729008040 6054195311.5999307632 129.0000067342 129.0000067342 129.0000067342
77 243.8345031069 2820.2948644965 2146689.3450991292 2639.0700900389 319.0890745962 6054301407.5461502075 129.0000069126 129.0000069126 129.0000069126
78 244.0193931195 2820.4067881628 2146689.3541226317 2639.0435094409 319.3310271594 6054541698.3381366730 129.0000070934 129.0000070934 129.0000070934
79 243.9919100078 2820.3799166166 2146689.3632626338 2639.0164249037 319.2950619430 6054484039.2541246414 129.0000072765 129.0000072765 129.0000072765
80 244.0965612207 2820.4387335935 2146689.3725191355 2638.9888176882 319.4320116291 6054610327.1403293610 129.0000074619 129.0000074619 129.0000074619
81 244.1334315951 2820.4535208568 2146689.3818921377 2638.9608330195 319.4802612965 6054642097.2373485565 129.0000076496 129.0000076496 129.0000076496
82 244.3029520408 2820.5543485196 2146689.3913816395 2638.9318525796 319.7021007878 6054858569.6761827469 129.0000078397 129.0000078397 129.0000078397
83 244.3445761189 2820.5713690935 2146689.4009876498 2638.9021684795 319.7565712929 6054895134.6560049057 129.0000080321 129.0000080321 129.0000080321
84 244.2696671559 2820.5125763350 2146689.4107101629 2638.8720941742 319.6585431986 6054768952.2869329453 129.0000082269 129.0000082269 129.0000082269
85 244.5161919319 2820.6629431352 2146689.4205491822 2638.8415194387 319.9811528443 6055091770.8571672440 129.0000084240 129.0000084240 129.0000084240
86 244.5641090282 2820.6838080201 2146689.4305047127 2638.8103612394 320.0438585800 6055136589.3662166595 129.0000086234 129.0000086234 129.0000086234
87 244.5348240638 2820.6541129118 2146689.4405767513 2638.7789728309 320.0055354056 6055072871.6007261276 129.0000088251 129.0000088251 129.0000088251
88 244.6939431427 2820.7468233396 2146689.4507653015 2638.7470269267 320.2137633592 6055271920.8364210129 129.0000090292 129.0000090292 129.0000090292
89 244.8800201091 2820.8567117003 2146689.4610703662 2638.7147520097 320.4572692055 6055507846.0901927948 129.0000092356 129.0000092356 129.0000092356
90 244.8804280382 2820.8451141876 2146689.4714919478 2638.6820441173 320.4578030336 6055482979.2295818329 129.0000094444 129.0000094444 129.0000094444
91 244.9558851986 2820.8815975090 2146689.4820300462 2638.6491836104 320.5565485155 6055561327.3181543350 129.0000096555 129.0000096555 129.0000096555
92 244.9965893140 2820.8949614294 2146689.4926846647 2638.6159817170 320.6098151301 6055590045.5610351562 129.0000098689 129.0000098689 129.0000098689
93 245.1381056687 2820.9732811388 2146689.5034558061 2638.5824451870 320.7950076360 6055758204.0434722900 129.0000100846 129.0000100846 129.0000100846
94 245.2954807041 2821.0619342131 2146689.5143434699 2638.5485198222 321.0009532826 6055948545.3822879791 129.0000103027 129.0000103027 129.0000103027
95 245.3535822199 2821.0860553731 2146689.5253476589 2638.5144817512 321.0769866522 6056000357.0671482086 129.0000105232 129.0000105232 129.0000105232
96 245.5013476026 2821.1682908185 2146689.5364683764 2638.4801107361 321.2703568219 6056176922.4099712372 129.0000107459 129.0000107459 129.0000107459
97 245.4166531417 2821.0989038023 2146689.5477056229 2638.4453663061 321.1595231342 6056028001.7295455933 129.0000109710 129.0000109710 129.0000109710
98 245.4121937790 2821.0817490953 2146689.5590593945 2638.4097762390 321.1536874797 6055991207.9293851852 129.0000111984 129.0000111984 129.0000111984
99 245.4532592994 2821.0946353191 2146689.5705296928 2638.3738037546 321.2074270397 6056018903.0102539062 129.0000114282 129.0000114282 129.0000114282
100 245.7500657390 2821.2735939427 2146689.5821165247 2638.3375549051 321.5958367642 6056403104.3106222153 129.0000116603 129.0000116603 129.0000116603
Loop time of 5.22601 on 1 procs for 100 steps with 10125 atoms
Performance: 2.133 ns/day, 11.250 hours/ns, 24.691 timesteps/s
99.8% CPU use with 1 MPI tasks x no OpenMP threads
Performance: 1.653 ns/day, 14.517 hours/ns, 19.135 timesteps/s
99.7% CPU use with 1 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.46587 | 0.46587 | 0.46587 | 0.0 | 11.50
Neigh | 1.4713 | 1.4713 | 1.4713 | 0.0 | 36.33
Comm | 0.05567 | 0.05567 | 0.05567 | 0.0 | 1.37
Output | 0.011364 | 0.011364 | 0.011364 | 0.0 | 0.28
Modify | 2.0158 | 2.0158 | 2.0158 | 0.0 | 49.77
Other | | 0.03004 | | | 0.74
Pair | 0.44045 | 0.44045 | 0.44045 | 0.0 | 8.43
Neigh | 2.669 | 2.669 | 2.669 | 0.0 | 51.07
Comm | 0.056143 | 0.056143 | 0.056143 | 0.0 | 1.07
Output | 0.012469 | 0.012469 | 0.012469 | 0.0 | 0.24
Modify | 2.0163 | 2.0163 | 2.0163 | 0.0 | 38.58
Other | | 0.03168 | | | 0.61
Nlocal: 10125 ave 10125 max 10125 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -172,4 +180,4 @@ Dangerous builds not checked
Please see the log.cite file for references relevant to this simulation
Total wall time: 0:00:04
Total wall time: 0:00:05

View File

@ -1,163 +1,163 @@
############################################################################
# Input file for investigating twinning nucleation under uniaxial loading with basal plane vector analysis
# Christopher Barrett, March 2013
# This script requires a Mg pair potential file to be in the same directory.
# fname is the file name. It is necessary for loops to work correctly. (See jump command)
variable fname index in.basal
######################################
# POTENTIAL VARIABLES
# lattice parameters and the minimum energy per atom which should be obtained with the current pair potential and homogeneous lattice
variable lx equal 3.181269601
variable b equal sqrt(3)
variable c equal sqrt(8/3)
variable ly equal ${b}*${lx}
variable lz equal ${c}*${lx}
variable pairlocation index almg.liu
variable pairstyle index eam/alloy/opt
######################################
# EQUILIBRATION/DEFORMATION VARIABLES
# eqpress = 10 bar = 1 MPa
# tstep (the timestep) is set to a default value of 0.001 (1 fs)
# seed randomizes the velocity
# srate is the rate of strain in 1/s
# Ndump is the number of timesteps in between each dump of the atom coordinates
variable tstep equal 0.001
variable seed equal 95812384
variable srate equal 1e9
######################################
# INITIALIZATION
units metal
dimension 3
boundary s s s
atom_style atomic
######################################
# ATOM BUILD
atom_modify map array
# lattice custom scale a1 "coordinates of a1" a2 "coordinates of a2" a3 "coordinates of a3" basis "atom1 coordinates" basis "atom2 coordinates" basis "atom3 coordinates" basis "atom4 coordinates" orient x "crystallographic orientation of x axis" orient y "crystallographic orientation of y axis" z "crystallographic orientation of z axis"
lattice custom 3.181269601 a1 1 0 0 a2 0 1.732050808 0 a3 0 0 1.632993162 basis 0.0 0.0 0.0 basis 0.5 0.5 0 basis 0 0.3333333 0.5 basis 0.5 0.833333 0.5 orient x 0 1 1 orient y 1 0 0 orient z 0 1 -1
variable multiple equal 20
variable mx equal "v_lx*v_multiple"
variable my equal "v_ly*v_multiple"
variable mz equal "v_lz*v_multiple"
# the simulation region should be from 0 to a multiple of the periodic boundary in x, y and z.
region whole block 0 ${mz} 0 ${mx} 0 ${my} units box
create_box 2 whole
create_atoms 1 box basis 1 1 basis 2 1 basis 3 1 basis 4 1
region fixed1 block INF INF INF INF INF 10 units box
region fixed2 block INF INF INF INF 100 INF units box
group lower region fixed1
group upper region fixed2
group boundary union upper lower
group mobile subtract all boundary
variable natoms equal "count(all)"
print "# of atoms are: ${natoms}"
######################################
# INTERATOMIC POTENTIAL
pair_style ${pairstyle}
pair_coeff * * ${pairlocation} Mg Mg
######################################
# COMPUTES REQUIRED
compute csym all centro/atom 12
compute eng all pe/atom
compute eatoms all reduce sum c_eng
compute basal all basal/atom
######################################
# MINIMIZATION
# Primarily adjusts the c/a ratio to value predicted by EAM potential
reset_timestep 0
thermo 1
thermo_style custom step pe c_eatoms
min_style cg
minimize 1e-15 1e-15 1000 2000
variable eminimum equal "c_eatoms / count(all)"
print "%%e(it,1)=${eminimum}"
######################################
# EQUILIBRATION
reset_timestep 0
timestep ${tstep}
# atoms are given a random velocity based on a temperature of 100K.
velocity all create 100 ${seed} mom yes rot no
# temperature and pressure are set to 100 and 0
fix 1 all nve
# Set thermo output
thermo 100
thermo_style custom step lx ly lz press pxx pyy pzz pe temp
# Run for at least 2 picoseconds (assuming a 1 fs timestep)
run 2000
# Loop to run until pressure is below the variable eqpress (defined at beginning of file)
label loopeq
variable eq loop 100
run 250
variable converge equal press
if "${converge} <= 0" then "variable converge equal -press" else "variable converge equal press"
if "${converge} <= 50" then "jump ${fname} breakeq"
next eq
jump ${fname} loopeq
label breakeq
# Store length for strain rate calculations
variable tmp equal "lx"
variable L0 equal ${tmp}
print "Initial Length, L0: ${L0}"
unfix 1
######################################
# DEFORMATION
reset_timestep 0
timestep ${tstep}
# Impose constant strain rate
variable srate1 equal "v_srate / 1.0e10"
velocity upper set 0.0 NULL 0.0 units box
velocity lower set 0.0 NULL 0.0 units box
fix 2 upper setforce 0.0 NULL 0.0
fix 3 lower setforce 0.0 NULL 0.0
fix 1 all nve
# Output strain and stress info to file
# for units metal, pressure is in [bars] = 100 [kPa] = 1/10000 [GPa]
# p2 is in GPa
variable strain equal "(lx - v_L0)/v_L0"
variable p1 equal "v_strain"
variable p2 equal "-pxz/10000"
variable p3 equal "lx"
variable p4 equal "temp"
variable p5 equal "pe"
variable p6 equal "ke"
fix def1 all print 100 "${p1} ${p2} ${p3} ${p4} ${p5} ${p6}" file output.def1.txt screen no
# Dump coordinates to file (for void size calculations)
dump 1 all custom 1000 output.dump.* id x y z c_basal[1] c_basal[2] c_basal[3]
# Display thermo
thermo_style custom step v_strain pxz lx temp pe ke
restart 50000 output.restart
# run deformation for 100000 timesteps (10% strain assuming 1 fs timestep and 1e9/s strain rate)
variable runtime equal 0
label loop
displace_atoms all ramp x 0.0 ${srate1} z 10 100 units box
run 100
variable runtime equal ${runtime}+100
if "${runtime} < 100000" then "jump ${fname} loop"
######################################
# SIMULATION DONE
print "All done"
############################################################################
# Input file for investigating twinning nucleation under uniaxial loading with basal plane vector analysis
# Christopher Barrett, March 2013
# This script requires a Mg pair potential file to be in the same directory.
# fname is the file name. It is necessary for loops to work correctly. (See jump command)
variable fname index in.basal
######################################
# POTENTIAL VARIABLES
# lattice parameters and the minimum energy per atom which should be obtained with the current pair potential and homogeneous lattice
variable lx equal 3.181269601
variable b equal sqrt(3)
variable c equal sqrt(8/3)
variable ly equal ${b}*${lx}
variable lz equal ${c}*${lx}
variable pairlocation index almg.liu
variable pairstyle index eam/alloy/opt
######################################
# EQUILIBRATION/DEFORMATION VARIABLES
# eqpress = 10 bar = 1 MPa
# tstep (the timestep) is set to a default value of 0.001 (1 fs)
# seed randomizes the velocity
# srate is the rate of strain in 1/s
# Ndump is the number of timesteps in between each dump of the atom coordinates
variable tstep equal 0.001
variable seed equal 95812384
variable srate equal 1e9
######################################
# INITIALIZATION
units metal
dimension 3
boundary s s s
atom_style atomic
######################################
# ATOM BUILD
atom_modify map array
# lattice custom scale a1 "coordinates of a1" a2 "coordinates of a2" a3 "coordinates of a3" basis "atom1 coordinates" basis "atom2 coordinates" basis "atom3 coordinates" basis "atom4 coordinates" orient x "crystallographic orientation of x axis" orient y "crystallographic orientation of y axis" z "crystallographic orientation of z axis"
lattice custom 3.181269601 a1 1 0 0 a2 0 1.732050808 0 a3 0 0 1.632993162 basis 0.0 0.0 0.0 basis 0.5 0.5 0 basis 0 0.3333333 0.5 basis 0.5 0.833333 0.5 orient x 0 1 1 orient y 1 0 0 orient z 0 1 -1
variable multiple equal 20
variable mx equal "v_lx*v_multiple"
variable my equal "v_ly*v_multiple"
variable mz equal "v_lz*v_multiple"
# the simulation region should be from 0 to a multiple of the periodic boundary in x, y and z.
region whole block 0 ${mz} 0 ${mx} 0 ${my} units box
create_box 2 whole
create_atoms 1 box basis 1 1 basis 2 1 basis 3 1 basis 4 1
region fixed1 block INF INF INF INF INF 10 units box
region fixed2 block INF INF INF INF 100 INF units box
group lower region fixed1
group upper region fixed2
group boundary union upper lower
group mobile subtract all boundary
variable natoms equal "count(all)"
print "# of atoms are: ${natoms}"
######################################
# INTERATOMIC POTENTIAL
pair_style ${pairstyle}
pair_coeff * * ${pairlocation} Mg Mg
######################################
# COMPUTES REQUIRED
compute csym all centro/atom 12
compute eng all pe/atom
compute eatoms all reduce sum c_eng
compute basal all basal/atom
######################################
# MINIMIZATION
# Primarily adjusts the c/a ratio to value predicted by EAM potential
reset_timestep 0
thermo 1
thermo_style custom step pe c_eatoms
min_style cg
minimize 1e-15 1e-15 1000 2000
variable eminimum equal "c_eatoms / count(all)"
print "%%e(it,1)=${eminimum}"
######################################
# EQUILIBRATION
reset_timestep 0
timestep ${tstep}
# atoms are given a random velocity based on a temperature of 100K.
velocity all create 100 ${seed} mom yes rot no
# temperature and pressure are set to 100 and 0
fix 1 all nve
# Set thermo output
thermo 100
thermo_style custom step lx ly lz press pxx pyy pzz pe temp
# Run for at least 2 picoseconds (assuming a 1 fs timestep)
run 2000
# Loop to run until pressure is below the variable eqpress (defined at beginning of file)
label loopeq
variable eq loop 100
run 250
variable converge equal press
if "${converge} <= 0" then "variable converge equal -press" else "variable converge equal press"
if "${converge} <= 50" then "jump ${fname} breakeq"
next eq
jump ${fname} loopeq
label breakeq
# Store length for strain rate calculations
variable tmp equal "lx"
variable L0 equal ${tmp}
print "Initial Length, L0: ${L0}"
unfix 1
######################################
# DEFORMATION
reset_timestep 0
timestep ${tstep}
# Impose constant strain rate
variable srate1 equal "v_srate / 1.0e10"
velocity upper set 0.0 NULL 0.0 units box
velocity lower set 0.0 NULL 0.0 units box
fix 2 upper setforce 0.0 NULL 0.0
fix 3 lower setforce 0.0 NULL 0.0
fix 1 all nve
# Output strain and stress info to file
# for units metal, pressure is in [bars] = 100 [kPa] = 1/10000 [GPa]
# p2 is in GPa
variable strain equal "(lx - v_L0)/v_L0"
variable p1 equal "v_strain"
variable p2 equal "-pxz/10000"
variable p3 equal "lx"
variable p4 equal "temp"
variable p5 equal "pe"
variable p6 equal "ke"
fix def1 all print 100 "${p1} ${p2} ${p3} ${p4} ${p5} ${p6}" file output.def1.txt screen no
# Dump coordinates to file (for void size calculations)
dump 1 all custom 1000 output.dump.* id x y z c_basal[1] c_basal[2] c_basal[3]
# Display thermo
thermo_style custom step v_strain pxz lx temp pe ke
restart 50000 output.restart
# run deformation for 100000 timesteps (10% strain assuming 1 fs timestep and 1e9/s strain rate)
variable runtime equal 0
label loop
displace_atoms all ramp x 0.0 ${srate1} z 10 100 units box
run 100
variable runtime equal ${runtime}+100
if "${runtime} < 100000" then "jump ${fname} loop"
######################################
# SIMULATION DONE
print "All done"

View File

@ -15,6 +15,7 @@ bond_style harmonic
bond_coeff * 225.0 0.85
comm_modify vel yes
comm_modify cutoff 3.6
# must use pair hybrid, since srp bond particles
# do not interact with other atom types

View File

@ -78,7 +78,7 @@ run 100
# only output atoms near vacancy
compute coord all coord/atom $r
compute coord all coord/atom cutoff $r
#dump events all custom 1 dump.prd id type x y z
#dump_modify events thresh c_coord != 4

View File

@ -80,7 +80,7 @@ velocity all zero linear
# only output atoms near vacancy
compute coord all coord/atom $r
compute coord all coord/atom cutoff $r
#dump events all custom 1 dump.prd id type x y z
#dump_modify events thresh c_coord != 4

View File

@ -1,8 +0,0 @@
# Standard ignores
*~
*.pyc
\#*#
.#*
.*.swp
.cproject
.project

284
lib/kokkos/CHANGELOG.md Normal file
View File

@ -0,0 +1,284 @@
# Change Log
## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07)
**Implemented enhancements:**
- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589)
- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588)
- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584)
- Building Tutorial Examples [\#582](https://github.com/kokkos/kokkos/issues/582)
- Internal way for using ThreadVectorRange without TeamHandle [\#574](https://github.com/kokkos/kokkos/issues/574)
- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571)
- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557)
- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543)
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536)
- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535)
- Deprecate Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527)
- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522)
- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517)
- Kokkos Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501)
- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393)
- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79)
**Fixed bugs:**
- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586)
- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570)
- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568)
- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553)
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539)
- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534)
## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01)
**Implemented enhancements:**
- Add Changelog generation to our process. [\#506](https://github.com/kokkos/kokkos/issues/506)
**Fixed bugs:**
- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520)
- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516)
## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00)
**Implemented enhancements:**
- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511)
- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508)
- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498)
- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490)
- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485)
- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484)
- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473)
- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468)
- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465)
- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455)
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432)
- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421)
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
- Import WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325)
- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305)
- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300)
- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259)
**Fixed bugs:**
- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515)
- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509)
- multi\_scratch test with hwloc and pthreads seg-faults. [\#504](https://github.com/kokkos/kokkos/issues/504)
- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503)
- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502)
- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497)
- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491)
- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489)
- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479)
- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471)
- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424)
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
- New task policy implementation is pulling in old experimental code. [\#372](https://github.com/kokkos/kokkos/issues/372)
- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298)
## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10)
**Implemented enhancements:**
- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438)
- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436)
- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323)
- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195)
**Fixed bugs:**
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435)
- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422)
- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404)
- data\\(\\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310)
- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307)
- Core Oversubscription Detection Broken? [\#159](https://github.com/kokkos/kokkos/issues/159)
## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06)
**Implemented enhancements:**
- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411)
- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390)
- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387)
- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378)
- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373)
- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341)
- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336)
- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218)
**Fixed bugs:**
- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407)
- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402)
- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398)
- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389)
- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385)
- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365)
- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352)
- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344)
- Can I allocate a View w/ both WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324)
- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309)
- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196)
## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21)
[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00)
**Implemented enhancements:**
- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327)
- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321)
- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295)
- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280)
- Inter-operatibility between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245)
- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177)
- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176)
- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99)
- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293)
- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292)
- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286)
- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267)
- Suggestion: Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261)
- More flexible create\_mirror functions [\#260](https://github.com/kokkos/kokkos/issues/260)
- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256)
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234)
- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230)
- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227)
- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216)
- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212)
- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194)
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189)
- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158)
- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153)
- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152)
- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149)
- Dynamic resizeable 1dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143)
- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142)
- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138)
- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135)
- Add variant of subview that lets users add traits [\#134](https://github.com/kokkos/kokkos/issues/134)
- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121)
- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117)
- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108)
- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106)
- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105)
- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104)
- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103)
- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100)
- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81)
- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80)
- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72)
- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53)
- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50)
- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44)
- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28)
- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1)
**Fixed bugs:**
- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340)
- seg fault Kokkos::Impl::CudaInternal::print\_configuration [\#338](https://github.com/kokkos/kokkos/issues/338)
- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335)
- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311)
- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303)
- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279)
- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312)
- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299)
- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297)
- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296)
- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282)
- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281)
- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262)
- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248)
- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238)
- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225)
- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210)
- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. [\#202](https://github.com/kokkos/kokkos/issues/202)
- nvcc\_wrapper Does Not Support -Xcompiler \<compiler option\> [\#198](https://github.com/kokkos/kokkos/issues/198)
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186)
- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182)
- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175)
- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173)
- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166)
- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161)
- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157)
- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139)
- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131)
- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125)
- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119)
- Qthread taskpolicy initialization bug. [\#92](https://github.com/kokkos/kokkos/issues/92)
- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89)
- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88)
- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87)
- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85)
- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76)
- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69)
- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63)
- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60)
- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20)
- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6)
- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4)
**Closed issues:**
- develop branch broken with CUDA 8 and --expt-extended-lambda [\#354](https://github.com/kokkos/kokkos/issues/354)
- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349)
- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343)
- Can I safely use int indices in a 2-D View with capacity \> 2B? [\#318](https://github.com/kokkos/kokkos/issues/318)
- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317)
- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277)
- deleted [\#271](https://github.com/kokkos/kokkos/issues/271)
- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268)
- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246)
- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232)
- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209)
- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207)
- Unit test failure on Hansen KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200)
- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180)
- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171)
- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165)
- Tie atomic updates to execution space or even to thread team? \(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144)
- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137)
- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136)
- Signed/unsigned comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130)
- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126)
- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110)
- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91)
- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90)
- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84)
- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42)
- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39)
- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31)
- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25)
- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24)
- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23)
- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21)
- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17)
- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16)
- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9)
- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5)
## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15)
\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)*

View File

@ -34,8 +34,8 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
# for compatibility with Kokkos' Makefile build system.
TRIBITS_ADD_OPTION_AND_DEFINE(
${PACKAGE_NAME}_ENABLE_DEBUG
${PACKAGE_NAME_UC}_HAVE_DEBUG
Kokkos_ENABLE_DEBUG
KOKKOS_HAVE_DEBUG
"Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
${${PROJECT_NAME}_ENABLE_DEBUG}
)
@ -57,7 +57,21 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_UVM
KOKKOS_USE_CUDA_UVM
"Enable CUDA Unified Virtual Memory support in Kokkos."
"Enable CUDA Unified Virtual Memory as the default in Kokkos."
OFF
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_RDC
KOKKOS_HAVE_CUDA_RDC
"Enable CUDA Relocatable Device Code support in Kokkos."
OFF
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_Lambda
KOKKOS_HAVE_CUDA_LAMBDA
"Enable CUDA LAMBDA support in Kokkos."
OFF
)
@ -72,6 +86,9 @@ ASSERT_DEFINED(TPL_ENABLE_Pthread)
IF (Kokkos_ENABLE_Pthread AND NOT TPL_ENABLE_Pthread)
MESSAGE(FATAL_ERROR "You set Kokkos_ENABLE_Pthread=ON, but Trilinos' support for Pthread(s) is not enabled (TPL_ENABLE_Pthread=OFF). This is not allowed. Please enable Pthreads in Trilinos before attempting to enable Kokkos' support for Pthreads.")
ENDIF ()
IF (NOT TPL_ENABLE_Pthread)
ADD_DEFINITIONS(-DGTEST_HAS_PTHREAD=0)
ENDIF()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_OpenMP
@ -162,13 +179,28 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
#------------------------------------------------------------------------------
#
# C) Process the subpackages for Kokkos
# C) Install Kokkos' executable scripts
#
# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler.
# Kokkos needs nvcc_wrapper in order to build. Other libraries and
# executables also need nvcc_wrapper. Thus, we need to install it.
# If the argument of DESTINATION is a relative path, CMake computes it
# as relative to ${CMAKE_INSTALL_PREFIX}.
INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION bin)
#------------------------------------------------------------------------------
#
# D) Process the subpackages for Kokkos
#
TRIBITS_PROCESS_SUBPACKAGES()
#
# D) If Kokkos itself is enabled, process the Kokkos package
# E) If Kokkos itself is enabled, process the Kokkos package
#
TRIBITS_PACKAGE_DEF()

View File

@ -7,25 +7,26 @@ CXXFLAGS=$(CCFLAGS)
#Options: OpenMP,Serial,Pthreads,Cuda
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,KNL,BDW,SKX
KOKKOS_ARCH ?= ""
#Options: yes,no
KOKKOS_DEBUG ?= "no"
#Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
#Options: c++11
#Options: c++11,c++1z
KOKKOS_CXX_STANDARD ?= "c++11"
#Options: aggressive_vectorization,disable_profiling
KOKKOS_OPTIONS ?= ""
#Default settings specific options
#Options: force_uvm,use_ldg,rdc,enable_lambda
KOKKOS_CUDA_OPTIONS ?= ""
KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
# Check for general settings
KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
# Check for external libraries
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
@ -53,23 +54,71 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
endif
endif
# Check for other Execution Spaces
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
endif
# Check OS
KOKKOS_OS := $(shell uname -s)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l)
KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l)
KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l)
# Check compiler
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(CXX) --version 2>&1 | grep "nvcc" | wc -l)
ifneq ($(OMPI_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l)
endif
ifneq ($(MPICH_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l)
endif
KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version 2>&1 | grep "clang" | wc -l)
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2)
KOKKOS_INTERNAL_COMPILER_XL = 1
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
endif
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# OpenMP is turned on by default in Cray compiler environment
KOKKOS_INTERNAL_OPENMP_FLAG :=
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# OpenMP is turned on by default in Cray compiler environment
KOKKOS_INTERNAL_OPENMP_FLAG :=
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
endif
endif
endif
endif
@ -84,13 +133,11 @@ else
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
else
KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
endif
endif
endif
# Check for other Execution Spaces
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
# Check for Kokkos Architecture settings
#Intel based
@ -98,6 +145,7 @@ KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC |
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
@ -110,11 +158,13 @@ KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal60 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@ -127,13 +177,16 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
endif
#ARM based
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
#IBM based
KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
@ -145,17 +198,18 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
#Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
#Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -207,15 +261,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
endif
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
@ -230,9 +290,15 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_CXXFLAGS += -G
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_CXXFLAGS += -lineinfo
endif
KOKKOS_CXXFLAGS += -g
KOKKOS_LDFLAGS += -g -ldl
@ -273,13 +339,14 @@ endif
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
@ -289,27 +356,101 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
endif
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -expt-extended-lambda
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -expt-extended-lambda
else
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
endif
endif
endif
#Add Architecture flags
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
KOKKOS_CXXFLAGS += -march=armv8-a
KOKKOS_LDFLAGS += -march=armv8-a
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
KOKKOS_CXXFLAGS += -march=armv8.1-a
KOKKOS_LDFLAGS += -march=armv8.1-a
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
else
KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS += -tp=sandybridge
KOKKOS_LDFLAGS += -tp=sandybridge
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler or it could be XL on P8
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@ -322,7 +463,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS += -tp=haswell
KOKKOS_LDFLAGS += -tp=haswell
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
@ -352,52 +494,85 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX512
KOKKOS_LDFLAGS += -xCORE-AVX512
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Nothing here yet
KOKKOS_CXXFLAGS += -march=skylake-avx512
KOKKOS_LDFLAGS += -march=skylake-avx512
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -mmic
KOKKOS_LDFLAGS += -mmic
endif
#Figure out the architecture flag for Cuda
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_30
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_32
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_35
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_37
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_50
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_52
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_53
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_61
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
endif
endif
@ -424,6 +599,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
KOKKOS_LIBS += -lcudart -lcuda
endif
@ -443,7 +619,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
else
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
@ -451,6 +627,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
#Explicitly set the GCC Toolchain for Clang
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
endif
#With Cygwin functions such as fdopen and fileno are not defined
#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
#though. So we hard undefine it here. Not sure if that has any bad side effects
@ -471,7 +655,7 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
include $(KOKKOS_PATH)/Makefile.targets
kokkos-clean:
-rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
ar cr libkokkos.a $(KOKKOS_OBJ_LINK)

View File

@ -14,20 +14,16 @@ Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
@ -38,8 +34,6 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
@ -47,8 +41,6 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
@ -67,6 +59,4 @@ endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp

View File

@ -45,31 +45,32 @@ Primary tested compilers on X86 are:
Intel 14.0.4
Intel 15.0.2
Intel 16.0.1
Intel 17.0.098
Clang 3.5.2
Clang 3.6.1
Clang 3.9.0
Primary tested compilers on Power 8 are:
IBM XL 13.1.3 (OpenMP,Serial)
GCC 4.9.2 (OpenMP,Serial)
GCC 5.3.0 (OpenMP,Serial)
GCC 5.4.0 (OpenMP,Serial)
IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
Primary tested compilers on Intel KNL are:
Intel 16.2.181 (with gcc 4.7.2)
Intel 17.0.098 (with gcc 4.7.2)
Secondary tested compilers are:
CUDA 6.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.8.4)
CUDA 7.5 (with gcc 4.7.2)
CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
CUDA/Clang 8.0 using Clang/Trunk compiler
Other compilers working:
X86:
Intel 17.0.042 (the FENL example causes internal compiler error)
PGI 15.4
Cygwin 2.1.0 64bit with gcc 4.9.3
KNL:
Intel 16.2.181 (the FENL example causes internal compiler error)
Intel 17.0.042 (the FENL example causes internal compiler error)
Known non-working combinations:
Power8:
GCC 6.1.0
Pthreads backend
@ -92,9 +93,10 @@ master branch, without -Werror and only for a select set of backends.
In the 'example/tutorial' directory you will find step by step tutorial
examples which explain many of the features of Kokkos. They work with
simple Makefiles. To build with g++ and OpenMP simply type 'make openmp'
simple Makefiles. To build with g++ and OpenMP simply type 'make'
in the 'example/tutorial' directory. This will build all examples in the
subfolders.
subfolders. To change the build options refer to the Programming Guide
in the compilation section.
============================================================================
====Running Unit Tests======================================================

View File

@ -476,54 +476,54 @@ namespace Kokkos {
};
template<class Generator>
struct rand<Generator, ::Kokkos::complex<float> > {
struct rand<Generator, Kokkos::complex<float> > {
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> max () {
return ::Kokkos::complex<float> (1.0, 1.0);
static Kokkos::complex<float> max () {
return Kokkos::complex<float> (1.0, 1.0);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen) {
static Kokkos::complex<float> draw (Generator& gen) {
const float re = gen.frand ();
const float im = gen.frand ();
return ::Kokkos::complex<float> (re, im);
return Kokkos::complex<float> (re, im);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& range) {
static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& range) {
const float re = gen.frand (real (range));
const float im = gen.frand (imag (range));
return ::Kokkos::complex<float> (re, im);
return Kokkos::complex<float> (re, im);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& start, const ::Kokkos::complex<float>& end) {
static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& start, const Kokkos::complex<float>& end) {
const float re = gen.frand (real (start), real (end));
const float im = gen.frand (imag (start), imag (end));
return ::Kokkos::complex<float> (re, im);
return Kokkos::complex<float> (re, im);
}
};
template<class Generator>
struct rand<Generator, ::Kokkos::complex<double> > {
struct rand<Generator, Kokkos::complex<double> > {
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> max () {
return ::Kokkos::complex<double> (1.0, 1.0);
static Kokkos::complex<double> max () {
return Kokkos::complex<double> (1.0, 1.0);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen) {
static Kokkos::complex<double> draw (Generator& gen) {
const double re = gen.drand ();
const double im = gen.drand ();
return ::Kokkos::complex<double> (re, im);
return Kokkos::complex<double> (re, im);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& range) {
static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& range) {
const double re = gen.drand (real (range));
const double im = gen.drand (imag (range));
return ::Kokkos::complex<double> (re, im);
return Kokkos::complex<double> (re, im);
}
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& start, const ::Kokkos::complex<double>& end) {
static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& start, const Kokkos::complex<double>& end) {
const double re = gen.drand (real (start), real (end));
const double im = gen.drand (imag (start), imag (end));
return ::Kokkos::complex<double> (re, im);
return Kokkos::complex<double> (re, im);
}
};
@ -670,8 +670,8 @@ namespace Kokkos {
double S = 2.0;
double U;
while(S>=1.0) {
U = drand();
const double V = drand();
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*sqrt(-2.0*log(S)/S);
@ -910,8 +910,8 @@ namespace Kokkos {
double S = 2.0;
double U;
while(S>=1.0) {
U = drand();
const double V = drand();
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*sqrt(-2.0*log(S)/S);
@ -1163,8 +1163,8 @@ namespace Kokkos {
double S = 2.0;
double U;
while(S>=1.0) {
U = drand();
const double V = drand();
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*sqrt(-2.0*log(S)/S);

View File

@ -51,7 +51,7 @@
namespace Kokkos {
namespace SortImpl {
namespace Impl {
template<class ValuesViewType, int Rank=ValuesViewType::Rank>
struct CopyOp;
@ -199,7 +199,7 @@ public:
parallel_for(values.dimension_0(),
bin_sort_sort_functor<ValuesViewType, offset_type,
SortImpl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
deep_copy(values,sorted_values);
}
@ -262,17 +262,15 @@ public:
}
};
namespace SortImpl {
template<class KeyViewType>
struct DefaultBinOp1D {
struct BinOp1D {
const int max_bins_;
const double mul_;
typename KeyViewType::const_value_type range_;
typename KeyViewType::const_value_type min_;
//Construct BinOp with number of bins, minimum value and maxuimum value
DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
typename KeyViewType::const_value_type max )
:max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {}
@ -298,13 +296,13 @@ struct DefaultBinOp1D {
};
template<class KeyViewType>
struct DefaultBinOp3D {
struct BinOp3D {
int max_bins_[3];
double mul_[3];
typename KeyViewType::non_const_value_type range_[3];
typename KeyViewType::non_const_value_type min_[3];
DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
typename KeyViewType::const_value_type max[] )
{
max_bins_[0] = max_bins__[0]+1;
@ -348,109 +346,11 @@ struct DefaultBinOp3D {
}
};
template<typename Scalar>
struct min_max {
Scalar min;
Scalar max;
bool init;
KOKKOS_INLINE_FUNCTION
min_max() {
min = 0;
max = 0;
init = 0;
}
KOKKOS_INLINE_FUNCTION
min_max (const min_max& val) {
min = val.min;
max = val.max;
init = val.init;
}
KOKKOS_INLINE_FUNCTION
min_max operator = (const min_max& val) {
min = val.min;
max = val.max;
init = val.init;
return *this;
}
KOKKOS_INLINE_FUNCTION
void operator+= (const Scalar& val) {
if(init) {
min = min<val?min:val;
max = max>val?max:val;
} else {
min = val;
max = val;
init = 1;
}
}
KOKKOS_INLINE_FUNCTION
void operator+= (const min_max& val) {
if(init && val.init) {
min = min<val.min?min:val.min;
max = max>val.max?max:val.max;
} else {
if(val.init) {
min = val.min;
max = val.max;
init = 1;
}
}
}
KOKKOS_INLINE_FUNCTION
void operator+= (volatile const Scalar& val) volatile {
if(init) {
min = min<val?min:val;
max = max>val?max:val;
} else {
min = val;
max = val;
init = 1;
}
}
KOKKOS_INLINE_FUNCTION
void operator+= (volatile const min_max& val) volatile {
if(init && val.init) {
min = min<val.min?min:val.min;
max = max>val.max?max:val.max;
} else {
if(val.init) {
min = val.min;
max = val.max;
init = 1;
}
}
}
};
template<class ViewType>
struct min_max_functor {
typedef typename ViewType::execution_space execution_space;
ViewType view;
typedef min_max<typename ViewType::non_const_value_type> value_type;
min_max_functor (const ViewType view_):view(view_) {
}
KOKKOS_INLINE_FUNCTION
void operator()(const size_t& i, value_type& val) const {
val += view(i);
}
};
namespace Impl {
template<class ViewType>
bool try_std_sort(ViewType view) {
bool possible = true;
#if ! KOKKOS_USING_EXP_VIEW
size_t stride[8];
view.stride(stride);
#else
size_t stride[8] = { view.stride_0()
, view.stride_1()
, view.stride_2()
@ -460,8 +360,7 @@ bool try_std_sort(ViewType view) {
, view.stride_6()
, view.stride_7()
};
#endif
possible = possible && Impl::is_same<typename ViewType::memory_space, HostSpace>::value;
possible = possible && std::is_same<typename ViewType::memory_space, HostSpace>::value;
possible = possible && (ViewType::Rank == 1);
possible = possible && (stride[0] == 1);
if(possible) {
@ -470,27 +369,39 @@ bool try_std_sort(ViewType view) {
return possible;
}
template<class ViewType>
struct min_max_functor {
typedef Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> minmax_scalar;
ViewType view;
min_max_functor(const ViewType& view_):view(view_) {}
KOKKOS_INLINE_FUNCTION
void operator() (const size_t& i, minmax_scalar& minmax) const {
if(view(i) < minmax.min_val) minmax.min_val = view(i);
if(view(i) > minmax.max_val) minmax.max_val = view(i);
}
};
}
template<class ViewType>
void sort(ViewType view, bool always_use_kokkos_sort = false) {
if(!always_use_kokkos_sort) {
if(SortImpl::try_std_sort(view)) return;
if(Impl::try_std_sort(view)) return;
}
typedef BinOp1D<ViewType> CompType;
typedef SortImpl::DefaultBinOp1D<ViewType> CompType;
SortImpl::min_max<typename ViewType::non_const_value_type> val;
parallel_reduce(view.dimension_0(),SortImpl::min_max_functor<ViewType>(view),val);
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true);
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
Impl::min_max_functor<ViewType>(view),reducer);
if(result.min_val == result.max_val) return;
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
}
/*template<class ViewType, class Comparator>
void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) {
}*/
}
#endif

View File

@ -1,6 +1,6 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES

View File

@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
default: build_all
echo "End Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
else
CXX = g++
endif
CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
TEST_TARGETS =

View File

@ -131,6 +131,10 @@ void test_1D_sort(unsigned int n,bool force_kokkos) {
typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
KeyViewType keys("Keys",n);
// Test sorting array with all numbers equal
Kokkos::deep_copy(keys,KeyType(1));
Kokkos::sort(keys,force_kokkos);
Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
@ -174,7 +178,7 @@ void test_3D_sort(unsigned int n) {
typename KeyViewType::value_type min[3] = {0,0,0};
typename KeyViewType::value_type max[3] = {100,100,100};
typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp;
typedef Kokkos::BinOp3D< KeyViewType > BinOp;
BinOp bin_op(bin_max,min,max);
Kokkos::BinSort< KeyViewType , BinOp >
Sorter(keys,bin_op,false);

View File

@ -0,0 +1,43 @@
KOKKOS_PATH = ${HOME}/kokkos
SRC = $(wildcard *.cpp)
KOKKOS_DEVICES=Cuda
KOKKOS_CUDA_OPTIONS=enable_lambda
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
EXE = bytes_and_flops.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
EXE = bytes_and_flops.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
CXXFLAGS = -O3 -g
DEPFLAGS = -M
LINK = ${CXX}
LINKFLAGS =
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) bench.hpp bench_unroll_stride.hpp bench_stride.hpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,99 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
template<class Scalar, int Unroll,int Stride>
struct Run {
static void run(int N, int K, int R, int F, int T, int S);
};
template<class Scalar, int Stride>
struct RunStride {
static void run_1(int N, int K, int R, int F, int T, int S);
static void run_2(int N, int K, int R, int F, int T, int S);
static void run_3(int N, int K, int R, int F, int T, int S);
static void run_4(int N, int K, int R, int F, int T, int S);
static void run_5(int N, int K, int R, int F, int T, int S);
static void run_6(int N, int K, int R, int F, int T, int S);
static void run_7(int N, int K, int R, int F, int T, int S);
static void run_8(int N, int K, int R, int F, int T, int S);
static void run(int N, int K, int R, int U, int F, int T, int S);
};
#define STRIDE 1
#include<bench_stride.hpp>
#undef STRIDE
#define STRIDE 2
#include<bench_stride.hpp>
#undef STRIDE
#define STRIDE 4
#include<bench_stride.hpp>
#undef STRIDE
#define STRIDE 8
#include<bench_stride.hpp>
#undef STRIDE
#define STRIDE 16
#include<bench_stride.hpp>
#undef STRIDE
#define STRIDE 32
#include<bench_stride.hpp>
#undef STRIDE
template<class Scalar>
void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) {
if(D == 1)
RunStride<Scalar,1>::run(N,K,R,U,F,T,S);
if(D == 2)
RunStride<Scalar,2>::run(N,K,R,U,F,T,S);
if(D == 4)
RunStride<Scalar,4>::run(N,K,R,U,F,T,S);
if(D == 8)
RunStride<Scalar,8>::run(N,K,R,U,F,T,S);
if(D == 16)
RunStride<Scalar,16>::run(N,K,R,U,F,T,S);
if(D == 32)
RunStride<Scalar,32>::run(N,K,R,U,F,T,S);
}

View File

@ -0,0 +1,124 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#define UNROLL 1
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 2
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 3
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 4
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 5
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 6
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 7
#include<bench_unroll_stride.hpp>
#undef UNROLL
#define UNROLL 8
#include<bench_unroll_stride.hpp>
#undef UNROLL
template<class Scalar>
struct RunStride<Scalar,STRIDE> {
static void run_1(int N, int K, int R, int F, int T, int S) {
Run<Scalar,1,STRIDE>::run(N,K,R,F,T,S);
}
static void run_2(int N, int K, int R, int F, int T, int S) {
Run<Scalar,2,STRIDE>::run(N,K,R,F,T,S);
}
static void run_3(int N, int K, int R, int F, int T, int S) {
Run<Scalar,3,STRIDE>::run(N,K,R,F,T,S);
}
static void run_4(int N, int K, int R, int F, int T, int S) {
Run<Scalar,4,STRIDE>::run(N,K,R,F,T,S);
}
static void run_5(int N, int K, int R, int F, int T, int S) {
Run<Scalar,5,STRIDE>::run(N,K,R,F,T,S);
}
static void run_6(int N, int K, int R, int F, int T, int S) {
Run<Scalar,6,STRIDE>::run(N,K,R,F,T,S);
}
static void run_7(int N, int K, int R, int F, int T, int S) {
Run<Scalar,7,STRIDE>::run(N,K,R,F,T,S);
}
static void run_8(int N, int K, int R, int F, int T, int S) {
Run<Scalar,8,STRIDE>::run(N,K,R,F,T,S);
}
static void run(int N, int K, int R, int U, int F, int T, int S) {
if(U==1) {
run_1(N,K,R,F,T,S);
}
if(U==2) {
run_2(N,K,R,F,T,S);
}
if(U==3) {
run_3(N,K,R,F,T,S);
}
if(U==4) {
run_4(N,K,R,F,T,S);
}
if(U==5) {
run_5(N,K,R,F,T,S);
}
if(U==6) {
run_6(N,K,R,F,T,S);
}
if(U==7) {
run_7(N,K,R,F,T,S);
}
if(U==8) {
run_8(N,K,R,F,T,S);
}
}
};

View File

@ -0,0 +1,148 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
template<class Scalar>
struct Run<Scalar,UNROLL,STRIDE> {
static void run(int N, int K, int R, int F, int T, int S) {
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> A("A",N,K);
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> B("B",N,K);
Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> C("C",N,K);
Kokkos::deep_copy(A,Scalar(1.5));
Kokkos::deep_copy(B,Scalar(2.5));
Kokkos::deep_copy(C,Scalar(3.5));
Kokkos::Timer timer;
Kokkos::parallel_for("BenchmarkKernel",Kokkos::TeamPolicy<>(N,T).set_scratch_size(0,Kokkos::PerTeam(S)),
KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type& team) {
const int n = team.league_rank();
for(int r=0; r<R; r++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,K), [&] (const int& i) {
Scalar a1 = A(n,i,0);
const Scalar b = B(n,i,0);
#if(UNROLL>1)
Scalar a2 = a1*1.3;
#endif
#if(UNROLL>2)
Scalar a3 = a2*1.1;
#endif
#if(UNROLL>3)
Scalar a4 = a3*1.1;
#endif
#if(UNROLL>4)
Scalar a5 = a4*1.3;
#endif
#if(UNROLL>5)
Scalar a6 = a5*1.1;
#endif
#if(UNROLL>6)
Scalar a7 = a6*1.1;
#endif
#if(UNROLL>7)
Scalar a8 = a7*1.1;
#endif
for(int f = 0; f<F; f++) {
a1 += b*a1;
#if(UNROLL>1)
a2 += b*a2;
#endif
#if(UNROLL>2)
a3 += b*a3;
#endif
#if(UNROLL>3)
a4 += b*a4;
#endif
#if(UNROLL>4)
a5 += b*a5;
#endif
#if(UNROLL>5)
a6 += b*a6;
#endif
#if(UNROLL>6)
a7 += b*a7;
#endif
#if(UNROLL>7)
a8 += b*a8;
#endif
}
#if(UNROLL==1)
C(n,i,0) = a1;
#endif
#if(UNROLL==2)
C(n,i,0) = a1+a2;
#endif
#if(UNROLL==3)
C(n,i,0) = a1+a2+a3;
#endif
#if(UNROLL==4)
C(n,i,0) = a1+a2+a3+a4;
#endif
#if(UNROLL==5)
C(n,i,0) = a1+a2+a3+a4+a5;
#endif
#if(UNROLL==6)
C(n,i,0) = a1+a2+a3+a4+a5+a6;
#endif
#if(UNROLL==7)
C(n,i,0) = a1+a2+a3+a4+a5+a6+a7;
#endif
#if(UNROLL==8)
C(n,i,0) = a1+a2+a3+a4+a5+a6+a7+a8;
#endif
});
}
});
Kokkos::fence();
double seconds = timer.seconds();
double bytes = 1.0*N*K*R*3*sizeof(Scalar);
double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
printf("NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf\n",N,K,R,UNROLL,F,T,S,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds);
}
};

View File

@ -0,0 +1,96 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<bench.hpp>
int main(int argc, char* argv[]) {
Kokkos::initialize();
if(argc<10) {
printf("Arguments: N K R D U F T S\n");
printf(" P: Precision (1==float, 2==double)\n");
printf(" N,K: dimensions of the 2D array to allocate\n");
printf(" R: how often to loop through the K dimension with each team\n");
printf(" D: distance between loaded elements (stride)\n");
printf(" U: how many independent flops to do per load\n");
printf(" F: how many times to repeat the U unrolled operations before reading next element\n");
printf(" T: team size\n");
printf(" S: shared memory per team (used to control occupancy on GPUs)\n");
printf("Example Input GPU:\n");
printf(" Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n");
printf(" Cache Bound : 2 100000 1024 64 1 1 1 512 20000\n");
printf(" Compute Bound : 2 100000 1024 1 1 8 64 256 6000\n");
printf(" Load Slots Used : 2 20000 256 32 16 1 1 256 6000\n");
printf(" Inefficient Load: 2 20000 256 32 2 1 1 256 20000\n");
Kokkos::finalize();
return 0;
}
int P = atoi(argv[1]);
int N = atoi(argv[2]);
int K = atoi(argv[3]);
int R = atoi(argv[4]);
int D = atoi(argv[5]);
int U = atoi(argv[6]);
int F = atoi(argv[7]);
int T = atoi(argv[8]);
int S = atoi(argv[9]);
if(U>8) {printf("U must be 1-8\n"); return 0;}
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}
if(P==1) {
run_stride_unroll<float>(N,K,R,D,U,F,T,S);
}
if(P==2) {
run_stride_unroll<double>(N,K,R,D,U,F,T,S);
}
Kokkos::finalize();
}

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ${HOME}/kokkos
SRC = $(wildcard *.cpp)
KOKKOS_DEVICES=Cuda
KOKKOS_CUDA_OPTIONS=enable_lambda
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
EXE = gather.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
EXE = gather.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
CXXFLAGS = -O3 -g
DEPFLAGS = -M
LINK = ${CXX}
LINKFLAGS =
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
$(warning ${KOKKOS_CPPFLAGS})
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,92 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
template<class Scalar, int UNROLL>
struct RunGather {
static void run(int N, int K, int D, int R, int F);
};
#define UNROLL 1
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 2
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 3
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 4
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 5
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 6
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 7
#include<gather_unroll.hpp>
#undef UNROLL
#define UNROLL 8
#include<gather_unroll.hpp>
#undef UNROLL
// Translate the runtime unroll request U into the matching compile-time
// RunGather instantiation. Values of U outside 1..8 run nothing, exactly
// as with the original if-chain.
template<class Scalar>
void run_gather_test(int N, int K, int D, int R, int U, int F) {
  switch(U) {
    case 1: RunGather<Scalar,1>::run(N,K,D,R,F); break;
    case 2: RunGather<Scalar,2>::run(N,K,D,R,F); break;
    case 3: RunGather<Scalar,3>::run(N,K,D,R,F); break;
    case 4: RunGather<Scalar,4>::run(N,K,D,R,F); break;
    case 5: RunGather<Scalar,5>::run(N,K,D,R,F); break;
    case 6: RunGather<Scalar,6>::run(N,K,D,R,F); break;
    case 7: RunGather<Scalar,7>::run(N,K,D,R,F); break;
    case 8: RunGather<Scalar,8>::run(N,K,D,R,F); break;
    default: break; // unsupported unroll factor: silently do nothing
  }
}

View File

@ -0,0 +1,169 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<Kokkos_Core.hpp>
#include<Kokkos_Random.hpp>
// Benchmark kernel for one unroll factor. This header is included several
// times (see gather.hpp) with the preprocessor symbol UNROLL set to 1..8;
// the #if blocks below keep UNROLL independent accumulators (a1..a8) live
// per gathered element so instruction-level parallelism can be varied.
template<class Scalar>
struct RunGather<Scalar,UNROLL> {
  // N: number of entities; K: values gathered per entity; D: max index
  // distance of a gather; R: timed repetitions; F: iterations of the
  // dependent flop loop per gathered element.
  static void run(int N, int K, int D, int R, int F) {
    Kokkos::View<int**> connectivity("Connectivity",N,K);
    Kokkos::View<Scalar*> A_in("Input",N);
    // NOTE(review): label "Input" duplicates A_in's label; harmless at
    // runtime, but a distinct label would make profiler output clearer.
    Kokkos::View<Scalar*> B_in("Input",N);
    Kokkos::View<Scalar*> C("Output",N);
    Kokkos::Random_XorShift64_Pool<> rand_pool(12313);
    Kokkos::deep_copy(A_in,1.5);
    Kokkos::deep_copy(B_in,2.0);
    // Const + RandomAccess views allow read-only caching of the gathered data.
    Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > A(A_in);
    Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > B(B_in);
    // Fill connectivity with random neighbor indices within +-D/2 of i,
    // wrapped into [0,N).
    Kokkos::parallel_for("InitKernel",N,
      KOKKOS_LAMBDA (const int& i) {
        auto rand_gen = rand_pool.get_state();
        for( int jj=0; jj<K; jj++) {
          connectivity(i,jj) = (rand_gen.rand(D) + i - D/2 + N)%N;
        }
        rand_pool.free_state(rand_gen);
      });
    Kokkos::fence();
    Kokkos::Timer timer;
    for(int r = 0; r<R; r++) {
      Kokkos::parallel_for("BenchmarkKernel",N,
        KOKKOS_LAMBDA (const int& i) {
          Scalar c = Scalar(0.0);
          for( int jj=0; jj<K; jj++) {
            const int j = connectivity(i,jj);
            Scalar a1 = A(j);
            const Scalar b = B(j);
#if(UNROLL>1)
            Scalar a2 = a1*Scalar(1.3);
#endif
#if(UNROLL>2)
            Scalar a3 = a2*Scalar(1.1);
#endif
#if(UNROLL>3)
            Scalar a4 = a3*Scalar(1.1);
#endif
#if(UNROLL>4)
            Scalar a5 = a4*Scalar(1.3);
#endif
#if(UNROLL>5)
            Scalar a6 = a5*Scalar(1.1);
#endif
#if(UNROLL>6)
            Scalar a7 = a6*Scalar(1.1);
#endif
#if(UNROLL>7)
            Scalar a8 = a7*Scalar(1.1);
#endif
            // Dependent multiply-add chain: 2 flops per iteration on each
            // of the UNROLL independent accumulators.
            for(int f = 0; f<F; f++) {
              a1 += b*a1;
#if(UNROLL>1)
              a2 += b*a2;
#endif
#if(UNROLL>2)
              a3 += b*a3;
#endif
#if(UNROLL>3)
              a4 += b*a4;
#endif
#if(UNROLL>4)
              a5 += b*a5;
#endif
#if(UNROLL>5)
              a6 += b*a6;
#endif
#if(UNROLL>6)
              a7 += b*a7;
#endif
#if(UNROLL>7)
              a8 += b*a8;
#endif
            }
            // Reduce the live accumulators into the per-entity result.
#if(UNROLL==1)
            c += a1;
#endif
#if(UNROLL==2)
            c += a1+a2;
#endif
#if(UNROLL==3)
            c += a1+a2+a3;
#endif
#if(UNROLL==4)
            c += a1+a2+a3+a4;
#endif
#if(UNROLL==5)
            c += a1+a2+a3+a4+a5;
#endif
#if(UNROLL==6)
            c += a1+a2+a3+a4+a5+a6;
#endif
#if(UNROLL==7)
            c += a1+a2+a3+a4+a5+a6+a7;
#endif
#if(UNROLL==8)
            c += a1+a2+a3+a4+a5+a6+a7+a8;
#endif
          }
          C(i) = c ;
        });
      Kokkos::fence();
    }
    double seconds = timer.seconds();
    // Bytes moved per gather: one index plus one element each from A and B;
    // plus one result store per entity per repetition.
    double bytes = 1.0*N*K*R*(2*sizeof(Scalar)+sizeof(int)) + 1.0*N*R*sizeof(Scalar);
    double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
    double gather_ops = 1.0*N*K*R*2;
    // BUGFIX: sizeof(Scalar)/4 is a size_t; passing it to "%i" is undefined
    // behavior on LP64 platforms where size_t is wider than int. Cast to int.
    printf("SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf GGather/s: %lf\n",
           (int)(sizeof(Scalar)/4),N,K,D,R,UNROLL,F,
           seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds,1.e-9*gather_ops/seconds);
  }
};

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,73 +36,58 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<gather.hpp>
#include <impl/Kokkos_HBWAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
#include <iostream>
#ifdef KOKKOS_HAVE_HBWSPACE
#include <memkind.h>
namespace Kokkos {
namespace Experimental {
namespace Impl {
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
/*--------------------------------------------------------------------------*/
void* HBWMallocAllocator::allocate( size_t size )
{
std::cout<< "Allocate HBW: " << 1.0e-6*size << "MB" << std::endl;
void * ptr = NULL;
if (size) {
ptr = memkind_malloc(MEMKIND_TYPE,size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
if(argc<8) {
printf("Arguments: S N K D\n");
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");
printf(" N: Number of entities\n");
printf(" K: Number of things to gather per entity\n");
printf(" D: Max distance of gathered things of an entity\n");
printf(" R: how often to loop through the K dimension with each team\n");
printf(" U: how many independent flops to do per load\n");
printf(" F: how many times to repeat the U unrolled operations before reading next element\n");
printf("Example Input GPU:\n");
printf(" Bandwidth Bound : 2 10000000 1 1 10 1 1\n");
printf(" Cache Bound : 2 10000000 64 1 10 1 1\n");
printf(" Cache Gather : 2 10000000 64 256 10 1 1\n");
printf(" Global Gather : 2 100000000 16 100000000 1 1 1\n");
printf(" Typical MD : 2 100000 32 512 1000 8 2\n");
Kokkos::finalize();
return 0;
}
return ptr;
int S = atoi(argv[1]);
int N = atoi(argv[2]);
int K = atoi(argv[3]);
int D = atoi(argv[4]);
int R = atoi(argv[5]);
int U = atoi(argv[6]);
int F = atoi(argv[7]);
if( (S!=1) && (S!=2) && (S!=4)) {printf("S must be one of 1,2,4\n"); return 0;}
if( N<D ) {printf("N must be larger or equal to D\n"); return 0; }
if(S==1) {
run_gather_test<float>(N,K,D,R,U,F);
}
if(S==2) {
run_gather_test<double>(N,K,D,R,U,F);
}
if(S==4) {
run_gather_test<Kokkos::complex<double> >(N,K,D,R,U,F);
}
Kokkos::finalize();
}
// Return a block to the high-bandwidth-memory kind. Deallocating a null
// pointer is a no-op; the size argument is unused by memkind_free.
void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
  if (ptr != NULL) {
    memkind_free(MEMKIND_TYPE, ptr);
  }
}
// Resize an HBW allocation via memkind_realloc. A null result for a
// non-zero request is treated as failure and raises a runtime exception;
// a zero-size request may legitimately return NULL.
void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
  void * const new_ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size);
  const bool failed = (new_size > 0u) && (new_ptr == NULL);
  if (failed) {
    Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
  }
  return new_ptr;
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif

284
lib/kokkos/bin/nvcc_wrapper Executable file
View File

@ -0,0 +1,284 @@
#!/bin/bash
#
# This shell script (nvcc_wrapper) wraps both the host compiler and
# NVCC, if you are building legacy C or C++ code with CUDA enabled.
# The script remedies some differences between the interface of NVCC
# and that of the host compiler, in particular for linking.
# It also means that a legacy code doesn't need separate .cu files;
# it can just use .cpp files.
#
# Default settings: change those according to your machine. For
# example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
default_arch="sm_35"
#default_arch="sm_50"
#
# The default C++ compiler.
#
host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
#host_compiler="icpc"
#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
#
# Internal variables
#
# C++ files
cpp_files=""
# Host compiler arguments
xcompiler_args=""
# Cuda (NVCC) only arguments
cuda_args=""
# Arguments for both NVCC and Host compiler
shared_args=""
# Linker arguments
xlinker_args=""
# Object files passable to NVCC
object_files=""
# Link objects for the host linker only
object_files_xlinker=""
# Shared libraries with version numbers are not handled correctly by NVCC
shared_versioned_libraries_host=""
shared_versioned_libraries=""
# Does the User set the architecture
arch_set=0
# Does the user overwrite the host compiler
ccbin_set=0
#Error code of compilation
error_code=0
# Do a dry run without actually compiling
dry_run=0
# Skip NVCC compilation and use host compiler directly
host_only=0
# Enable workaround for CUDA 6.5 for pragma ident
replace_pragma_ident=0
# Mark first host compiler argument
first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
#echo "Arguments: $# $@"
# Sort every command-line argument into the NVCC-only, host-only, shared,
# or linker buckets declared above.
while [ $# -gt 0 ]
do
case $1 in
#show the executed command
--show|--nvcc-wrapper-show)
dry_run=1
;;
#run host compilation only
--host-only)
host_only=1
;;
#replace '#pragma ident' with '#ident' this is needed to compile OpenMPI due to a configure script bug and a non standardized behaviour of pragma with macros
--replace-pragma-ident)
replace_pragma_ident=1
;;
#handle source files to be compiled as cuda files
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
;;
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
-O*)
if [ $optimization_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
else
shared_args="$shared_args $1"
optimization_applied=1
fi
;;
#Handle shared args (valid for both nvcc and the host compiler)
-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1"
;;
#Handle shared args that have an argument
-o|-MT)
shared_args="$shared_args $1 $2"
shift
;;
#Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle more known nvcc args
--expt-extended-lambda|--expt-relaxed-constexpr)
cuda_args="$cuda_args $1"
;;
#Handle known nvcc args that have an argument
-rdc|-maxrregcount|--default-stream)
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle c++11 setting
--std=c++11|-std=c++11)
shared_args="$shared_args $1"
;;
#strip off -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
-std=c++98|--std=c++98)
;;
#strip off pedantic because it produces endless warnings about #LINE added by the preprocessor
-pedantic|-Wpedantic|-ansi)
;;
#strip -Xcompiler because we add it
-Xcompiler)
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args="$2"
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,$2"
fi
shift
;;
#strip off "-x cu" because we add that
-x)
if [[ $2 != "cu" ]]; then
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args="-x,$2"
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,-x,$2"
fi
fi
shift
;;
#Handle -ccbin (if its not set we can set it to a default value)
-ccbin)
cuda_args="$cuda_args $1 $2"
ccbin_set=1
host_compiler=$2
shift
;;
#Handle -arch argument (if its not set use a default
-arch*)
cuda_args="$cuda_args $1"
arch_set=1
;;
#Handle -Xcudafe argument
-Xcudafe)
cuda_args="$cuda_args -Xcudafe $2"
shift
;;
#Handle args that should be sent to the linker
#(xlinker_args feeds the nvcc link line; host_linker_args keeps the
#un-prefixed form for the host-only command)
-Wl*)
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
host_linker_args="$host_linker_args ${1:4:${#1}}"
;;
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
*.a|*.so|*.o|*.obj)
object_files="$object_files $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.dylib)
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle shared libraries with *.so.* names which nvcc can't do.
*.so.*)
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
;;
#All other args are sent to the host compiler
*)
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args=$1
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,$1"
fi
;;
esac
shift
done
#Add default host compiler if necessary
if [ $ccbin_set -ne 1 ]; then
cuda_args="$cuda_args -ccbin $host_compiler"
fi
#Add architecture command
if [ $arch_set -ne 1 ]; then
cuda_args="$cuda_args -arch=$default_arch"
fi
#Compose compilation command
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi
#Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
#Rewrite affected sources into $temp_dir and compile the copies instead.
if [ $replace_pragma_ident -eq 1 ]; then
cpp_files2=""
for file in $cpp_files
do
var=`grep pragma ${file} | grep ident | grep "#"`
if [ "${#var}" -gt 0 ]
then
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file
cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file"
else
cpp_files2="$cpp_files2 $file"
fi
done
cpp_files=$cpp_files2
#echo $cpp_files
fi
#With sources present, compile everything as CUDA (-x cu) and pass objects
#through -Xlinker; with no sources this is a pure link step.
if [ "$cpp_files" ]; then
nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
else
nvcc_command="$nvcc_command $object_files"
fi
if [ "$cpp_files" ]; then
host_command="$host_command $object_files $cpp_files"
else
host_command="$host_command $object_files"
fi
#Print command for dryrun
if [ $dry_run -eq 1 ]; then
if [ $host_only -eq 1 ]; then
echo $host_command
else
echo $nvcc_command
fi
exit 0
fi
#Run compilation command
if [ $host_only -eq 1 ]; then
$host_command
else
$nvcc_command
fi
error_code=$?
#Report error code
exit $error_code

View File

@ -53,12 +53,12 @@
# ************************************************************************
# @HEADER
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
#include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
IF (TPL_ENABLE_CUDA)
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ENDIF()
#IF (TPL_ENABLE_CUDA)
# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
# TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
#ENDIF()

View File

@ -1,6 +1,16 @@
INCLUDE(CMakeParseArguments)
INCLUDE(CTest)
cmake_policy(SET CMP0054 NEW)
IF(NOT DEFINED ${PROJECT_NAME})
project(Kokkos)
ENDIF()
IF(NOT DEFINED ${${PROJECT_NAME}_ENABLE_DEBUG}})
SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
ENDIF()
FUNCTION(ASSERT_DEFINED VARS)
FOREACH(VAR ${VARS})
IF(NOT DEFINED ${VAR})
@ -75,6 +85,13 @@ MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
ENDMACRO()
function(INCLUDE_DIRECTORIES)
cmake_parse_arguments(INCLUDE_DIRECTORIES "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN})
_INCLUDE_DIRECTORIES(${INCLUDE_DIRECTORIES_UNPARSED_ARGUMENTS})
endfunction()
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
SET(PROP_VALUES)
FOREACH(TARGET_X ${ARGN})
@ -271,6 +288,11 @@ ENDFUNCTION()
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
FUNCTION(TRIBITS_ADD_TEST)
ENDFUNCTION()
FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
ENDFUNCTION()
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)

0
lib/kokkos/config/configure_compton_cpu.sh Executable file → Normal file
View File

0
lib/kokkos/config/configure_compton_mic.sh Executable file → Normal file
View File

0
lib/kokkos/config/configure_kokkos.sh Executable file → Normal file
View File

0
lib/kokkos/config/configure_kokkos_nvidia.sh Executable file → Normal file
View File

0
lib/kokkos/config/configure_shannon.sh Executable file → Normal file
View File

View File

@ -91,9 +91,20 @@ Step 3:
// -------------------------------------------------------------------------------- //
Step 4:
4.1. Once all Trilinos tests pass promote Kokkos develop branch to master on Github
Step 4: Once all Trilinos tests pass promote Kokkos develop branch to master on Github
4.1. Generate Changelog (You need a github API token)
Close all Open issues with "InDevelop" tag on github
(Not from kokkos directory)
github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG'
(Copy the new section from the generated CHANGELOG.md to the kokkos/CHANGELOG.md)
(Make desired changes to CHANGELOG.md to enhance clarity)
(Commit and push the CHANGELOG to develop)
4.2 Merge develop into Master
- DO NOT fast-forward the merge!!!!
(From kokkos directory):
@ -103,7 +114,7 @@ Step 4:
git reset --hard origin/master
git merge --no-ff origin/develop
4.2. Update the tag in kokkos/config/master_history.txt
4.3. Update the tag in kokkos/config/master_history.txt
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
Tag format: #.#.##

View File

@ -1,3 +1,6 @@
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a
tag: 2.01.10 date: 09:27:2016 master: e4119325 develop: e6cda11e
tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e
tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304
tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966

View File

@ -121,6 +121,10 @@ do
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle more known nvcc args
--expt-extended-lambda|--expt-relaxed-constexpr)
cuda_args="$cuda_args $1"
;;
#Handle known nvcc args that have an argument
-rdc|-maxrregcount|--default-stream)
cuda_args="$cuda_args $1 $2"

View File

@ -16,6 +16,8 @@ elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
MACHINE=bowman
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
MACHINE=shepard
elif [[ "$HOSTNAME" =~ apollo ]]; then
MACHINE=apollo
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
MACHINE=sems
else
@ -28,6 +30,7 @@ IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial"
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
@ -44,102 +47,12 @@ BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False
SKIP_HWLOC=False
SPOT_CHECK=False
ARCH_FLAG=""
PRINT_HELP=False
OPT_FLAG=""
KOKKOS_OPTIONS=""
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
)
ARCH_FLAG="--arch=Power8"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=KNL"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=HSW"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
@ -173,7 +86,211 @@ NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
--dry-run*)
DRYRUN=True
;;
--help)
--spot-check*)
SPOT_CHECK=True
;;
--arch*)
ARCH_FLAG="--arch=${key#*=}"
;;
--opt-flag*)
OPT_FLAG="${key#*=}"
;;
--with-cuda-options*)
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
;;
--help*)
PRINT_HELP=True
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
done
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
# set kokkos path
if [ -z "$KOKKOS_PATH" ]; then
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
else
# Ensure KOKKOS_PATH is abs path
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
fi
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/sems/modulefiles/utils/sems-modules-init.sh
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG=""
fi
if [ "$SPOT_CHECK" = "True" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
fi
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=Power8,Kepler37"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=KNL"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=HSW"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
elif [ "$MACHINE" = "apollo" ]; then
source /projects/sems/modulefiles/utils/sems-modules-init.sh
module use /home/projects/modulefiles/local/x86-64
module load kokkos-env
module load sems-git
module load sems-tex
module load sems-cmake/3.5.2
module load sems-gdb
SKIP_HWLOC=True
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
if [ "$SPOT_CHECK" = "True" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
fi
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=SNB,Kepler35"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
if [ "$PRINT_HELP" = "True" ]; then
echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
@ -183,6 +300,9 @@ echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
echo "--arch=ARCHITECTURE: overwrite architecture flags"
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
@ -220,21 +340,6 @@ echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
done
# set kokkos path
if [ -z "$KOKKOS_PATH" ]; then
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
else
# Ensure KOKKOS_PATH is abs path
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
fi
# set build type
@ -381,11 +486,15 @@ single_build_and_test() {
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
fi
if [[ "$OPT_FLAG" = "" ]]; then
OPT_FLAG="-O3"
fi
if [[ "$build_type" = *debug* ]]; then
local extra_args="$extra_args --debug"
local cxxflags="-g $compiler_warning_flags"
else
local cxxflags="-O3 $compiler_warning_flags"
local cxxflags="$OPT_FLAG $compiler_warning_flags"
fi
if [[ "$compiler" == cuda* ]]; then
@ -393,7 +502,9 @@ single_build_and_test() {
export TMPDIR=$(pwd)
fi
# cxxflags="-DKOKKOS_USING_EXP_VIEW=1 $cxxflags"
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
fi
echo " Starting job $desc"
@ -440,13 +551,14 @@ run_in_background() {
local compiler=$1
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
if [[ "$BUILD_ONLY" == True ]]; then
num_jobs=8
else
# don't override command line input
# if [[ "$BUILD_ONLY" == True ]]; then
# num_jobs=8
# else
if [[ "$compiler" == cuda* ]]; then
num_jobs=1
fi
fi
# fi
wait_for_jobs $num_jobs
single_build_and_test $* &

View File

@ -0,0 +1,50 @@
#!/bin/bash -le
export TRILINOS_UPDATED_PATH=${PWD}/trilinos-update
export TRILINOS_PRISTINE_PATH=${PWD}/trilinos-pristine
#rm -rf ${KOKKOS_PATH}
#rm -rf ${TRILINOS_UPDATED_PATH}
#rm -rf ${TRILINOS_PRISTINE_PATH}
#Already done:
if [ ! -d "${TRILINOS_UPDATED_PATH}" ]; then
git clone https://github.com/trilinos/trilinos ${TRILINOS_UPDATED_PATH}
fi
if [ ! -d "${TRILINOS_PRISTINE_PATH}" ]; then
git clone https://github.com/trilinos/trilinos ${TRILINOS_PRISTINE_PATH}
fi
cd ${TRILINOS_UPDATED_PATH}
git checkout develop
git reset --hard origin/develop
git pull
cd ..
python kokkos/config/snapshot.py ${KOKKOS_PATH} ${TRILINOS_UPDATED_PATH}/packages
cd ${TRILINOS_UPDATED_PATH}
echo ""
echo ""
echo "Trilinos State:"
git log --pretty=oneline --since=2.days
SHA=`git log --pretty=oneline --since=2.days | head -n 2 | tail -n 1 | awk '{print $1}'`
cd ..
cd ${TRILINOS_PRISTINE_PATH}
git status
git log --pretty=oneline --since=2.days
echo "Checkout develop"
git checkout develop
echo "Pull"
git pull
echo "Checkout SHA"
git checkout ${SHA}
cd ..
cd ${TRILINOS_PRISTINE_PATH}
echo ""
echo ""
echo "Trilinos Pristine State:"
git log --pretty=oneline --since=2.days
cd ..

View File

@ -1,6 +1,6 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES

View File

@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
default: build_all
echo "End Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
else
CXX = g++
endif
CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests
TEST_TARGETS =

View File

@ -83,7 +83,7 @@ TEST_F( cuda, dynrankview_perf )
{
std::cout << "Cuda" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
test_dynrankview_op_perf<Kokkos::Cuda>( 40960 );
}
TEST_F( cuda, global_2_local)

View File

@ -180,8 +180,8 @@ void test_dynrankview_op_perf( const int par_size )
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
const size_type dim2 = 900;
const size_type dim3 = 300;
const size_type dim2 = 90;
const size_type dim3 = 30;
double elapsed_time_view = 0;
double elapsed_time_compview = 0;

View File

@ -261,9 +261,6 @@ public:
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{
#if ! KOKKOS_USING_EXP_VIEW
Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
#else
if ( int(d_view.rank) != int(h_view.rank) ||
d_view.dimension_0() != h_view.dimension_0() ||
d_view.dimension_1() != h_view.dimension_1() ||
@ -284,7 +281,6 @@ public:
d_view.span() != h_view.span() ) {
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
}
#endif
}
//@}
@ -315,13 +311,13 @@ public:
template< class Device >
KOKKOS_INLINE_FUNCTION
const typename Impl::if_c<
Impl::is_same<typename t_dev::memory_space,
std::is_same<typename t_dev::memory_space,
typename Device::memory_space>::value,
t_dev,
t_host>::type& view () const
{
return Impl::if_c<
Impl::is_same<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
t_dev,
@ -347,13 +343,13 @@ public:
/// appropriate template parameter.
template<class Device>
void sync( const typename Impl::enable_if<
( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
( Impl::is_same< Device , int>::value)
( std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
( std::is_same< Device , int>::value)
, int >::type& = 0)
{
const unsigned int dev =
Impl::if_c<
Impl::is_same<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
@ -370,7 +366,7 @@ public:
modified_host() = modified_device() = 0;
}
}
if(Impl::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
t_dev::execution_space::fence();
t_host::execution_space::fence();
}
@ -378,13 +374,13 @@ public:
template<class Device>
void sync ( const typename Impl::enable_if<
( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
( Impl::is_same< Device , int>::value)
( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
( std::is_same< Device , int>::value)
, int >::type& = 0 )
{
const unsigned int dev =
Impl::if_c<
Impl::is_same<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
@ -405,7 +401,7 @@ public:
{
const unsigned int dev =
Impl::if_c<
Impl::is_same<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
@ -431,7 +427,7 @@ public:
void modify () {
const unsigned int dev =
Impl::if_c<
Impl::is_same<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
@ -514,11 +510,7 @@ public:
//! The allocation size (same as Kokkos::View::capacity).
size_t capacity() const {
#if KOKKOS_USING_EXP_VIEW
return d_view.span();
#else
return d_view.capacity();
#endif
}
//! Get stride(s) for each dimension.
@ -555,8 +547,6 @@ public:
// Partial specializations of Kokkos::subview() for DualView objects.
//
#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Impl {
@ -590,352 +580,6 @@ subview( const DualView<D,A1,A2,A3> & src , Args ... args )
} /* namespace Kokkos */
#else
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//
// Partial specializations of Kokkos::subview() for DualView objects.
//
namespace Kokkos {
namespace Impl {
template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
, class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
, class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
>
struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type >
, SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
, SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
{
private:
typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > SrcViewType ;
enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
// The source view rank must be equal to the input argument rank
// Once a void argument is encountered all subsequent arguments must be void.
enum { InputRank =
Impl::StaticAssert<( SrcViewType::rank ==
( V0 ? 0 : (
V1 ? 1 : (
V2 ? 2 : (
V3 ? 3 : (
V4 ? 4 : (
V5 ? 5 : (
V6 ? 6 : (
V7 ? 7 : 8 ))))))) ))
&&
( SrcViewType::rank ==
( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
>::value ? SrcViewType::rank : 0 };
enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+ unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
// Reverse
enum { R0_rev = 0 == InputRank ? 0u : (
1 == InputRank ? unsigned(R0) : (
2 == InputRank ? unsigned(R1) : (
3 == InputRank ? unsigned(R2) : (
4 == InputRank ? unsigned(R3) : (
5 == InputRank ? unsigned(R4) : (
6 == InputRank ? unsigned(R5) : (
7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
typedef typename SrcViewType::array_layout SrcViewLayout ;
// Choose array layout, attempting to preserve original layout if at all possible.
typedef typename Impl::if_c<
( // Same Layout IF
// OutputRank 0
( OutputRank == 0 )
||
// OutputRank 1 or 2, InputLayout Left, Interval 0
// because single stride one or second index has a stride.
( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
||
// OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
// because single stride one or second index has a stride.
( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
// Choose data type as a purely dynamic rank array to accomodate a runtime range.
typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
typename SrcViewType::value_type ********
>::type >::type >::type >::type >::type >::type >::type >::type OutputData ;
// Choose space.
// If the source view's template arg1 or arg2 is a space then use it,
// otherwise use the source view's execution space.
typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space
>::type >::type OutputSpace ;
public:
// If keeping the layout then match non-data type arguments
// else keep execution space and memory traits.
typedef typename
Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
, Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type >
, Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace
, typename SrcViewType::memory_traits >
>::type type ;
};
} /* namespace Impl */
} /* namespace Kokkos */
namespace Kokkos {
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , void , void , void
, void , void , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , void , void , void
, void , void , void , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0);
sub_view.h_view = subview(src.h_view,arg0);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , void , void
, void , void , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , void , void
, void , void , void , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1);
sub_view.h_view = subview(src.h_view,arg0,arg1);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , void
, void , void , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , void
, void , void , void , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, void , void , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, void , void , void , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , void , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , void , void ,void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , void , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , void , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 , class ArgType6 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , ArgType6 , void
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 ,
const ArgType6 & arg6 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , ArgType6 , void
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class D , class A1 , class A2 , class A3 ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
typename Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , ArgType6 , ArgType7
>::type
subview( const DualView<D,A1,A2,A3> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 ,
const ArgType6 & arg6 ,
const ArgType7 & arg7 )
{
typedef typename
Impl::ViewSubview< DualView<D,A1,A2,A3>
, ArgType0 , ArgType1 , ArgType2 , ArgType3
, ArgType4 , ArgType5 , ArgType6 , ArgType7
>::type
DstViewType ;
DstViewType sub_view;
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
} // namespace Kokkos
#endif /* KOKKOS_USING_EXP_VIEW */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -223,14 +223,85 @@ struct DynRankDimTraits {
);
}
template < typename DynRankViewType , typename iType >
void verify_dynrankview_rank ( iType N , const DynRankViewType &drv )
{
if ( static_cast<iType>(drv.rank()) > N )
{
Kokkos::abort( "Need at least rank arguments to the operator()" );
}
/** \brief Debug bounds-checking routines */
// Enhanced debug checking - most infrastructure matches that of functions in
// Kokkos_ViewMapping; additional checks for extra arguments beyond rank are 0
template< unsigned , typename iType0 , class MapType >
KOKKOS_INLINE_FUNCTION
bool dyn_rank_view_verify_operator_bounds( const iType0 & , const MapType & )
{ return true ; }
template< unsigned R , typename iType0 , class MapType , typename iType1 , class ... Args >
KOKKOS_INLINE_FUNCTION
bool dyn_rank_view_verify_operator_bounds
( const iType0 & rank
, const MapType & map
, const iType1 & i
, Args ... args
)
{
if ( static_cast<iType0>(R) < rank ) {
return ( size_t(i) < map.extent(R) )
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
}
else if ( i != 0 ) {
printf("DynRankView Debug Bounds Checking Error: at rank %u\n Extra arguments beyond the rank must be zero \n",R);
return ( false )
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
}
else {
return ( true )
&& dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
}
}
template< unsigned , class MapType >
inline
void dyn_rank_view_error_operator_bounds( char * , int , const MapType & )
{}
template< unsigned R , class MapType , class iType , class ... Args >
inline
void dyn_rank_view_error_operator_bounds
( char * buf
, int len
, const MapType & map
, const iType & i
, Args ... args
)
{
const int n =
snprintf(buf,len," %ld < %ld %c"
, static_cast<unsigned long>(i)
, static_cast<unsigned long>( map.extent(R) )
, ( sizeof...(Args) ? ',' : ')' )
);
dyn_rank_view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
}
// op_rank = rank of the operator version that was called
template< typename iType0 , typename iType1 , class MapType , class ... Args >
KOKKOS_INLINE_FUNCTION
void dyn_rank_view_verify_operator_bounds
( const iType0 & op_rank , const iType1 & rank , const char* label , const MapType & map , Args ... args )
{
if ( static_cast<iType0>(rank) > op_rank ) {
Kokkos::abort( "DynRankView Bounds Checking Error: Need at least rank arguments to the operator()" );
}
if ( ! dyn_rank_view_verify_operator_bounds<0>( rank , map , args ... ) ) {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
enum { LEN = 1024 };
char buffer[ LEN ];
int n = snprintf(buffer,LEN,"DynRankView bounds error of view %s (", label);
dyn_rank_view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
Kokkos::Impl::throw_runtime_exception(std::string(buffer));
#else
Kokkos::abort("DynRankView bounds error");
#endif
}
}
/** \brief Assign compatible default mappings */
@ -341,7 +412,6 @@ class DynRankView : public ViewTraits< DataType , Properties ... >
private:
template < class , class ... > friend class DynRankView ;
// template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now...
template < class , class ... > friend class Impl::ViewMapping ;
public:
@ -504,20 +574,26 @@ private:
( is_layout_left || is_layout_right || is_layout_stride )
};
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
{ KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
template< class Space > struct verify_space<Space,false>
{ KOKKOS_FORCEINLINE_FUNCTION static void check()
{ Kokkos::abort("Kokkos::DynRankView ERROR: attempt to access inaccessible memory space"); };
};
// Bounds checking macros
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
< Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \
Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ;
// rank of the calling operator - included as first argument in ARG
#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
Kokkos::Experimental::Impl::dyn_rank_view_verify_operator_bounds ARG ;
#else
#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
< Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
#endif
@ -532,7 +608,11 @@ public:
KOKKOS_INLINE_FUNCTION
reference_type operator()() const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( implementation_map() ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , NULL , m_map) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map) )
#endif
return implementation_map().reference();
//return m_map.reference(0,0,0,0,0,0,0);
}
@ -563,12 +643,17 @@ public:
return rankone_view(i0);
}
// Rank 1 parenthesis
template< typename iType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
operator()(const iType & i0 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
#endif
return m_map.reference(i0);
}
@ -577,6 +662,11 @@ public:
typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
operator()(const iType & i0 ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
#endif
return m_map.reference(i0,0,0,0,0,0,0);
}
@ -586,7 +676,11 @@ public:
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1) )
#endif
return m_map.reference(i0,i1);
}
@ -595,7 +689,11 @@ public:
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1) )
#endif
return m_map.reference(i0,i1,0,0,0,0,0);
}
@ -605,7 +703,11 @@ public:
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2) )
#endif
return m_map.reference(i0,i1,i2);
}
@ -614,7 +716,11 @@ public:
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2) )
#endif
return m_map.reference(i0,i1,i2,0,0,0,0);
}
@ -624,7 +730,11 @@ public:
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3) )
#endif
return m_map.reference(i0,i1,i2,i3);
}
@ -633,7 +743,11 @@ public:
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3) )
#endif
return m_map.reference(i0,i1,i2,i3,0,0,0);
}
@ -643,7 +757,11 @@ public:
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4) )
#endif
return m_map.reference(i0,i1,i2,i3,i4);
}
@ -652,7 +770,11 @@ public:
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4) )
#endif
return m_map.reference(i0,i1,i2,i3,i4,0,0);
}
@ -662,7 +784,11 @@ public:
typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5) )
#endif
return m_map.reference(i0,i1,i2,i3,i4,i5);
}
@ -671,7 +797,11 @@ public:
typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5) )
#endif
return m_map.reference(i0,i1,i2,i3,i4,i5,0);
}
@ -681,7 +811,11 @@ public:
typename std::enable_if< (std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type
operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
{
KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) )
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5 , i6) )
#else
KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6) )
#endif
return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
}
@ -1136,13 +1270,13 @@ private:
public:
typedef Kokkos::Experimental::ViewTraits
typedef Kokkos::ViewTraits
< data_type
, array_layout
, typename SrcTraits::device_type
, typename SrcTraits::memory_traits > traits_type ;
typedef Kokkos::Experimental::View
typedef Kokkos::View
< data_type
, array_layout
, typename SrcTraits::device_type
@ -1154,13 +1288,13 @@ public:
static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
typedef Kokkos::Experimental::ViewTraits
typedef Kokkos::ViewTraits
< data_type
, array_layout
, typename SrcTraits::device_type
, MemoryTraits > traits_type ;
typedef Kokkos::Experimental::View
typedef Kokkos::View
< data_type
, array_layout
, typename SrcTraits::device_type
@ -1264,7 +1398,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::Experimental::ViewTraits< D*******, P... > , Args... > metafcn ;
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
return metafcn::subview( src.rank() , src , args... );
}
@ -1502,10 +1636,10 @@ void deep_copy
typedef typename src_type::memory_space src_memory_space ;
enum { DstExecCanAccessSrc =
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
enum { SrcExecCanAccessDst =
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
if ( (void *) dst.data() != (void*) src.data() ) {
@ -1666,7 +1800,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
@ -1684,7 +1818,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
@ -1779,7 +1913,7 @@ void resize( DynRankView<T,P...> & v ,
{
typedef DynRankView<T,P...> drview_type ;
static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 );
@ -1803,7 +1937,7 @@ void realloc( DynRankView<T,P...> & v ,
{
typedef DynRankView<T,P...> drview_type ;
static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
const std::string label = v.label();

View File

@ -56,7 +56,7 @@ namespace Experimental {
* Subviews are not allowed.
*/
template< typename DataType , typename ... P >
class DynamicView : public Kokkos::Experimental::ViewTraits< DataType , P ... >
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
{
public:
@ -75,6 +75,15 @@ private:
std::is_same< typename traits::specialize , void >::value
, "DynamicView must have trivial data type" );
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
{ KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
template< class Space > struct verify_space<Space,false>
{ KOKKOS_FORCEINLINE_FUNCTION static void check()
{ Kokkos::abort("Kokkos::DynamicView ERROR: attempt to access inaccessible memory space"); };
};
public:
typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
@ -117,10 +126,10 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_t size() const
{
return
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space
>::value
>::accessible
? // Runtime size is at the end of the chunk pointer array
(*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
<< m_chunk_shift
@ -179,10 +188,7 @@ public:
static_assert( Kokkos::Impl::are_integral<I0,Args...>::value
, "Indices must be integral type" );
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space
>::verify();
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
// Which chunk is being indexed.
const uintptr_t ic = uintptr_t( i0 >> m_chunk_shift );
@ -223,15 +229,13 @@ public:
{
typedef typename traits::value_type value_type ;
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space >::verify();
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
if ( m_chunk_max < NC ) {
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
printf("DynamicView::resize_parallel(%lu) m_chunk_max(%lu) NC(%lu)\n"
printf("DynamicView::resize_parallel(%lu) m_chunk_max(%u) NC(%lu)\n"
, n , m_chunk_max , NC );
#endif
Kokkos::abort("DynamicView::resize_parallel exceeded maximum size");
@ -269,9 +273,7 @@ public:
inline
void resize_serial( size_t n )
{
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space >::verify();
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
@ -398,9 +400,7 @@ public:
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
{
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space >::verify();
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
// A functor to deallocate all of the chunks upon final destruction
@ -452,7 +452,7 @@ void deep_copy( const View<T,DP...> & dst
typedef typename ViewTraits<T,SP...>::memory_space src_memory_space ;
enum { DstExecCanAccessSrc =
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
@ -476,7 +476,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
typedef typename ViewTraits<T,SP...>::memory_space src_memory_space ;
enum { DstExecCanAccessSrc =
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.

View File

@ -0,0 +1,196 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
#define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
#include <vector>
#include <Kokkos_Core.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_DualView.hpp>
namespace Kokkos {
namespace Experimental {
// ErrorReporter collects error reports emitted from device-side (parallel)
// code and makes them retrievable on the host.  Reports are appended with
// add_report() (lock-free, via an atomic counter) into preallocated
// DualView storage of fixed capacity; attempts past the capacity are
// counted but their payloads are dropped.
template <typename ReportType, typename DeviceType>
class ErrorReporter
{
public:
typedef ReportType report_type;
typedef DeviceType device_type;
typedef typename device_type::execution_space execution_space;
// Allocate storage for at most max_results reports and zero the counter.
ErrorReporter(int max_results)
: m_numReportsAttempted(""),
m_reports("", max_results),
m_reporters("", max_results)
{
clear();
}
// Maximum number of reports the buffers can hold.
int getCapacity() const { return m_reports.h_view.dimension_0(); }
// Number of reports actually stored (attempt count clamped to capacity).
int getNumReports();
// Total number of add_report() calls, including those that were dropped.
int getNumReportAttempts();
// Copy stored reporter ids / reports into host-side std::vectors.
void getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out);
// Same, but hands back freshly allocated HostMirror views instead.
void getReports( typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out);
// Reset the attempt counter to zero (stored payloads are not erased).
void clear();
// Change the capacity of the report buffers to new_size entries.
void resize(const size_t new_size);
// True once the number of attempts has reached the buffer capacity.
bool full() {return (getNumReportAttempts() >= getCapacity()); }
// Device-callable: reserve a slot with atomic_fetch_add and store the
// report there.  Returns false (payload dropped) when the buffer is full.
KOKKOS_INLINE_FUNCTION
bool add_report(int reporter_id, report_type report) const
{
int idx = Kokkos::atomic_fetch_add(&m_numReportsAttempted(), 1);
if (idx >= 0 && (idx < static_cast<int>(m_reports.d_view.dimension_0()))) {
m_reporters.d_view(idx) = reporter_id;
m_reports.d_view(idx) = report;
return true;
}
else {
return false;
}
}
private:
typedef Kokkos::View<report_type *, execution_space> reports_view_t;
typedef Kokkos::DualView<report_type *, execution_space> reports_dualview_t;
typedef typename reports_dualview_t::host_mirror_space host_mirror_space;
// Rank-0 view holding the atomically incremented attempt counter.
Kokkos::View<int, execution_space> m_numReportsAttempted;
reports_dualview_t m_reports;
Kokkos::DualView<int *, execution_space> m_reporters;
};
// Number of reports actually stored: the raw attempt count, capped at the
// capacity of the report buffer.
template <typename ReportType, typename DeviceType>
inline int ErrorReporter<ReportType, DeviceType>::getNumReports()
{
  int attempts = 0;
  Kokkos::deep_copy(attempts, m_numReportsAttempted);
  const int capacity = static_cast<int>(m_reports.h_view.dimension_0());
  return (attempts > capacity) ? capacity : attempts;
}
// Raw attempt counter, including reports dropped because the buffer was full.
template <typename ReportType, typename DeviceType>
inline int ErrorReporter<ReportType, DeviceType>::getNumReportAttempts()
{
  int attempts = 0;
  Kokkos::deep_copy(attempts, m_numReportsAttempted);
  return attempts;
}
// Copy the stored reporter ids and reports into host-side std::vectors.
// The outputs are cleared first; on return both hold exactly
// getNumReports() entries, in slot order.
template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out)
{
  const int count = getNumReports();
  reporters_out.clear();
  reports_out.clear();
  reporters_out.reserve(count);
  reports_out.reserve(count);
  if (count <= 0) return;
  // Bring the host side of the dual views up to date before reading.
  m_reports.template sync<host_mirror_space>();
  m_reporters.template sync<host_mirror_space>();
  for (int i = 0; i < count; ++i) {
    reporters_out.push_back(m_reporters.h_view(i));
    reports_out.push_back(m_reports.h_view(i));
  }
}
// Retrieve stored reports into newly allocated HostMirror views.  The output
// views passed in are reassigned, so any views they previously referenced
// are released (subject to reference counting).
template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::getReports(
typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out)
{
int num_reports = getNumReports();
// Allocate the outputs to exactly the number of valid reports.
reporters_out = typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reporters_out",num_reports);
reports_out = typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reports_out",num_reports);
if (num_reports > 0) {
// Bring the host side of the dual views up to date before reading.
m_reports.template sync<host_mirror_space>();
m_reporters.template sync<host_mirror_space>();
for (int i = 0; i < num_reports; ++i) {
reporters_out(i) = m_reporters.h_view(i);
reports_out(i) = m_reports.h_view(i);
}
}
}
// Reset the attempt counter to zero.  Stored payloads are left in place;
// they simply become unreachable because the counter restarts at slot 0.
// NOTE(review): the device side is marked modified — presumably so a later
// sync() treats the device copies (where new reports are written) as the
// authoritative data rather than overwriting them from the host; confirm.
template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::clear()
{
int num_reports=0;
Kokkos::deep_copy(m_numReportsAttempted, num_reports);
m_reports.template modify<execution_space>();
m_reporters.template modify<execution_space>();
}
// Change the capacity of both report buffers to new_size entries.
// The attempt counter is NOT reset here.  The fence blocks until
// outstanding device work (including any asynchronous part of the
// DualView resize) has completed before returning.
template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size)
{
m_reports.resize(new_size);
m_reporters.resize(new_size);
Kokkos::fence();
}
} // namespace Experimental
} // namespace Kokkos
#endif

View File

@ -1,531 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SEGMENTED_VIEW_HPP_
#define KOKKOS_SEGMENTED_VIEW_HPP_
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <cstdio>
#if ! KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
struct delete_segmented_view;
// Default implementation: a no-op.  Only the CUDA memory-space
// specializations below need to adjust device allocation limits.
template<class MemorySpace>
inline
void DeviceSetAllocatableMemorySize(size_t) {}
#if defined( KOKKOS_HAVE_CUDA )
// CudaSpace: grow the CUDA device-side malloc heap if it cannot currently
// hold 'size' bytes.  The new limit is set to 2*size for headroom.
template<>
inline
void DeviceSetAllocatableMemorySize<Kokkos::CudaSpace>(size_t size) {
#ifdef __CUDACC__
size_t size_limit;
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
if(size_limit<size)
cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
// NOTE(review): this second read's result is unused — looks like a
// leftover debugging query; confirm before removing.
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
#endif
}
// CudaUVMSpace: identical policy to the CudaSpace specialization — grow the
// CUDA device-side malloc heap to 2*size if the current limit is too small.
template<>
inline
void DeviceSetAllocatableMemorySize<Kokkos::CudaUVMSpace>(size_t size) {
#ifdef __CUDACC__
size_t size_limit;
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
if(size_limit<size)
cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
// NOTE(review): result of this second read is unused; confirm intent.
cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
#endif
}
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
}
template< class DataType ,
class Arg1Type = void ,
class Arg2Type = void ,
class Arg3Type = void>
class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
{
public:
//! \name Typedefs for device types and various Kokkos::View specializations.
//@{
typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
//! The type of a Kokkos::View on the device.
typedef Kokkos::View< typename traits::data_type ,
typename traits::array_layout ,
typename traits::memory_space ,
Kokkos::MemoryUnmanaged > t_dev ;
private:
Kokkos::View<t_dev*,typename traits::memory_space> segments_;
Kokkos::View<int,typename traits::memory_space> realloc_lock;
Kokkos::View<int,typename traits::memory_space> nsegments_;
size_t segment_length_;
size_t segment_length_m1_;
int max_segments_;
int segment_length_log2;
// Dimensions, cardinality, capacity, and offset computation for
// multidimensional array view of contiguous memory.
// Inherits from Impl::Shape
typedef Kokkos::Impl::ViewOffset< typename traits::shape_type
, typename traits::array_layout
> offset_map_type ;
offset_map_type m_offset_map ;
typedef Kokkos::View< typename traits::array_intrinsic_type ,
typename traits::array_layout ,
typename traits::memory_space ,
typename traits::memory_traits > array_type ;
typedef Kokkos::View< typename traits::const_data_type ,
typename traits::array_layout ,
typename traits::memory_space ,
typename traits::memory_traits > const_type ;
typedef Kokkos::View< typename traits::non_const_data_type ,
typename traits::array_layout ,
typename traits::memory_space ,
typename traits::memory_traits > non_const_type ;
typedef Kokkos::View< typename traits::non_const_data_type ,
typename traits::array_layout ,
HostSpace ,
void > HostMirror ;
template< bool Accessible >
KOKKOS_INLINE_FUNCTION
typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type
dimension_0_intern() const { return nsegments_() * segment_length_ ; }
template< bool Accessible >
KOKKOS_INLINE_FUNCTION
typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type
dimension_0_intern() const
{
// In Host space
int n = 0 ;
#if ! defined( __CUDA_ARCH__ )
Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) );
#endif
return n * segment_length_ ;
}
public:
enum { Rank = traits::rank };
KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
/* \brief return (current) size of dimension 0 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const {
enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
int n = SegmentedView::dimension_0_intern< Accessible >();
return n ;
}
/* \brief return size of dimension 1 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
/* \brief return size of dimension 2 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
/* \brief return size of dimension 3 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
/* \brief return size of dimension 4 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
/* \brief return size of dimension 5 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
/* \brief return size of dimension 6 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
/* \brief return size of dimension 7 */
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
/* \brief return size of dimension 2 */
KOKKOS_INLINE_FUNCTION typename traits::size_type size() const {
return dimension_0() *
m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ;
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
typename traits::size_type dimension( const iType & i ) const {
if(i==0)
return dimension_0();
else
return Kokkos::Impl::dimension( m_offset_map , i );
}
KOKKOS_INLINE_FUNCTION
typename traits::size_type capacity() {
return segments_.dimension_0() *
m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7;
}
KOKKOS_INLINE_FUNCTION
typename traits::size_type get_num_segments() {
enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
int n = SegmentedView::dimension_0_intern< Accessible >();
return n/segment_length_ ;
}
KOKKOS_INLINE_FUNCTION
typename traits::size_type get_max_segments() {
return max_segments_;
}
/// \brief Constructor that allocates View objects with an initial length of 0.
///
/// This constructor works mostly like the analogous constructor of View.
/// The first argument is a string label, which is entirely for your
/// benefit. (Different SegmentedView objects may have the same label if
/// you like.) The second argument 'view_length' is the size of the segments.
/// This number must be a power of two. The third argument n0 is the maximum
/// value for the first dimension of the segmented view. The maximal allocatable
/// number of Segments is thus: (n0+view_length-1)/view_length.
/// The arguments that follow are the other dimensions of the (1-7) of the
/// View objects. For example, for a View with 3 runtime dimensions,
/// the first 4 integer arguments will be nonzero:
/// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView
/// with a maximum of 306 segments of dimension (32768,8,4). The logical size of
/// the segmented view is (n,8,4) with n between 0 and 10000000.
/// You may omit the integer arguments that follow.
template< class LabelType >
SegmentedView(const LabelType & label ,
const size_t view_length ,
const size_t n0 ,
const size_t n1 = 0 ,
const size_t n2 = 0 ,
const size_t n3 = 0 ,
const size_t n4 = 0 ,
const size_t n5 = 0 ,
const size_t n6 = 0 ,
const size_t n7 = 0
): segment_length_(view_length),segment_length_m1_(view_length-1)
{
segment_length_log2 = -1;
size_t l = segment_length_;
while(l>0) {
l>>=1;
segment_length_log2++;
}
l = 1<<segment_length_log2;
if(l!=segment_length_)
Kokkos::Impl::throw_runtime_exception("Kokkos::SegmentedView requires a 'power of 2' segment length");
max_segments_ = (n0+segment_length_m1_)/segment_length_;
Impl::DeviceSetAllocatableMemorySize<typename traits::memory_space>(segment_length_*max_segments_*sizeof(typename traits::value_type));
segments_ = Kokkos::View<t_dev*,typename traits::execution_space>(label , max_segments_);
realloc_lock = Kokkos::View<int,typename traits::execution_space>("Lock");
nsegments_ = Kokkos::View<int,typename traits::execution_space>("nviews");
m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 );
}
KOKKOS_INLINE_FUNCTION
SegmentedView(const SegmentedView& src):
segments_(src.segments_),
realloc_lock (src.realloc_lock),
nsegments_ (src.nsegments_),
segment_length_(src.segment_length_),
segment_length_m1_(src.segment_length_m1_),
max_segments_ (src.max_segments_),
segment_length_log2(src.segment_length_log2),
m_offset_map (src.m_offset_map)
{}
KOKKOS_INLINE_FUNCTION
SegmentedView& operator= (const SegmentedView& src) {
segments_ = src.segments_;
realloc_lock = src.realloc_lock;
nsegments_ = src.nsegments_;
segment_length_= src.segment_length_;
segment_length_m1_= src.segment_length_m1_;
max_segments_ = src.max_segments_;
segment_length_log2= src.segment_length_log2;
m_offset_map = src.m_offset_map;
return *this;
}
~SegmentedView() {
if ( !segments_.tracker().ref_counting()) { return; }
size_t ref_count = segments_.tracker().ref_count();
if(ref_count == 1u) {
Kokkos::fence();
typename Kokkos::View<int,typename traits::execution_space>::HostMirror h_nviews("h_nviews");
Kokkos::deep_copy(h_nviews,nsegments_);
Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view<DataType , Arg1Type , Arg2Type, Arg3Type>(*this));
}
}
KOKKOS_INLINE_FUNCTION
t_dev get_segment(const int& i) const {
return segments_[i];
}
template< class MemberType>
KOKKOS_INLINE_FUNCTION
void grow (MemberType& team_member, const size_t& growSize) const {
if (growSize>max_segments_*segment_length_) {
printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
return;
}
if(team_member.team_rank()==0) {
bool too_small = growSize > segment_length_ * nsegments_();
if (too_small) {
while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) )
; // get the lock
too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock
if(too_small) {
while(too_small) {
const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size];
segments_(nsegments_()) =
t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7);
nsegments_()++;
too_small = growSize > segment_length_ * nsegments_();
}
}
realloc_lock() = 0; //release the lock
}
}
team_member.team_barrier();
}
KOKKOS_INLINE_FUNCTION
void grow_non_thread_safe (const size_t& growSize) const {
if (growSize>max_segments_*segment_length_) {
printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
return;
}
bool too_small = growSize > segment_length_ * nsegments_();
if(too_small) {
while(too_small) {
const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
typename traits::non_const_value_type* const ptr =
new typename traits::non_const_value_type[alloc_size];
segments_(nsegments_()) =
t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2,
m_offset_map.N3, m_offset_map.N4, m_offset_map.N5,
m_offset_map.N6, m_offset_map.N7);
nsegments_()++;
too_small = growSize > segment_length_ * nsegments_();
}
}
}
template< typename iType0 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value && traits::rank == 1 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_));
}
template< typename iType0 , typename iType1 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
traits::rank == 2 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1);
}
template< typename iType0 , typename iType1 , typename iType2 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
traits::rank == 3 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2);
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
std::is_integral<iType3>::value &&
traits::rank == 4 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3);
}
// Rank-5 element access: i0 selects segment (high bits) and in-segment offset
// (low bits); i1..i4 pass through. Enabled only for integral indices, rank 5.
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
std::is_integral<iType3>::value &&
std::is_integral<iType4>::value &&
traits::rank == 5 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4);
}
// Rank-6 element access: i0 selects segment (high bits) and in-segment offset
// (low bits); i1..i5 pass through. Enabled only for integral indices, rank 6.
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
std::is_integral<iType3>::value &&
std::is_integral<iType4>::value &&
std::is_integral<iType5>::value &&
traits::rank == 6 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5);
}
// Rank-7 element access: i0 selects segment (high bits) and in-segment offset
// (low bits); i1..i6 pass through. Enabled only for integral indices, rank 7.
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 , typename iType6 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
std::is_integral<iType3>::value &&
std::is_integral<iType4>::value &&
std::is_integral<iType5>::value &&
std::is_integral<iType6>::value &&
traits::rank == 7 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6);
}
// Rank-8 element access: i0 selects segment (high bits) and in-segment offset
// (low bits); i1..i7 pass through. Enabled only for integral indices, rank 8.
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 , typename iType6 , typename iType7 >
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<iType0>::value &&
std::is_integral<iType1>::value &&
std::is_integral<iType2>::value &&
std::is_integral<iType3>::value &&
std::is_integral<iType4>::value &&
std::is_integral<iType5>::value &&
std::is_integral<iType6>::value &&
std::is_integral<iType7>::value &&
traits::rank == 8 )
, typename traits::value_type &
>::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
{
return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7);
}
};
namespace Impl {
// Functor that releases the per-segment allocations of a SegmentedView.
// Intended to be launched with one work item per segment; each invocation
// frees the raw array backing segment i.
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
struct delete_segmented_view {
typedef SegmentedView<DataType , Arg1Type , Arg2Type, Arg3Type> view_type;
typedef typename view_type::execution_space execution_space;
// Copy of the view handle whose segments are to be freed.
view_type view_;
delete_segmented_view(view_type view):view_(view) {
}
// Frees the device allocation of segment i (allocated with new[]).
KOKKOS_INLINE_FUNCTION
void operator() (int i) const {
delete [] view_.get_segment(i).ptr_on_device();
}
};
}
}
}
#endif
#endif

View File

@ -241,9 +241,9 @@ public:
typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type;
static const bool is_set = Impl::is_same<void,value_type>::value;
static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value;
static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value;
static const bool is_set = std::is_same<void,value_type>::value;
static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
static const bool has_const_value = is_set || std::is_same<const_value_type,declared_value_type>::value;
static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
static const bool is_modifiable_map = has_const_key && !has_const_value;
@ -735,8 +735,8 @@ public:
}
template <typename SKey, typename SValue, typename SDevice>
typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
typename Impl::enable_if< std::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
std::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
>::type
create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
{

View File

@ -1,6 +1,6 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES

View File

@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
default: build_all
echo "End Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
else
CXX = g++
endif
CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
TEST_TARGETS =

View File

@ -59,11 +59,13 @@
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestDynamicView.hpp>
#include <TestSegmentedView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
//----------------------------------------------------------------------------
@ -133,11 +135,6 @@ void cuda_test_dualview_combinations(unsigned int size)
test_dualview_combinations<int,Kokkos::Cuda>(size);
}
void cuda_test_segmented_view(unsigned int size)
{
test_segmented_view<double,Kokkos::Cuda>(size);
}
void cuda_test_bitset()
{
test_bitset<Kokkos::Cuda>();
@ -184,11 +181,6 @@ void cuda_test_bitset()
cuda_test_dualview_combinations(size); \
}
#define CUDA_SEGMENTEDVIEW_TEST( size ) \
TEST_F( cuda, segmentedview_##size##x) { \
cuda_test_segmented_view(size); \
}
CUDA_DUALVIEW_COMBINE_TEST( 10 )
CUDA_VECTOR_COMBINE_TEST( 10 )
CUDA_VECTOR_COMBINE_TEST( 3057 )
@ -198,7 +190,6 @@ CUDA_INSERT_TEST(close, 100000, 90000, 100, 500)
CUDA_INSERT_TEST(far, 100000, 90000, 100, 500)
CUDA_DEEP_COPY( 10000, 1 )
CUDA_FAILED_INSERT_TEST( 10000, 1000 )
CUDA_SEGMENTEDVIEW_TEST( 200 )
#undef CUDA_INSERT_TEST
@ -207,7 +198,6 @@ CUDA_SEGMENTEDVIEW_TEST( 200 )
#undef CUDA_DEEP_COPY
#undef CUDA_VECTOR_COMBINE_TEST
#undef CUDA_DUALVIEW_COMBINE_TEST
#undef CUDA_SEGMENTEDVIEW_TEST
TEST_F( cuda , dynamic_view )
@ -221,6 +211,18 @@ TEST_F( cuda , dynamic_view )
}
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(cuda, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Cuda>>();
}
#endif
TEST_F(cuda, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::Cuda>>();
}
}
#endif /* #ifdef KOKKOS_HAVE_CUDA */

View File

@ -715,9 +715,9 @@ public:
typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
typedef typename dView0::host_mirror_space host_drv_space ;
typedef Kokkos::Experimental::View< T , device > View0 ;
typedef Kokkos::Experimental::View< T* , device > View1 ;
typedef Kokkos::Experimental::View< T******* , device > View7 ;
typedef Kokkos::View< T , device > View0 ;
typedef Kokkos::View< T* , device > View1 ;
typedef Kokkos::View< T******* , device > View7 ;
typedef typename View0::host_mirror_space host_view_space ;
@ -1127,8 +1127,7 @@ public:
// T v2 = hx(0,0) ; // Generates compile error as intended
// hx(0,0) = v2 ; // Generates compile error as intended
/*
#if ! KOKKOS_USING_EXP_VIEW
#if 0 /* Asynchronous deep copies not implemented for dynamic rank view */
// Testing with asynchronous deep copy with respect to device
{
size_t count = 0 ;
@ -1193,7 +1192,7 @@ public:
{ ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
}}}}
}
#endif */ // #if ! KOKKOS_USING_EXP_VIEW
#endif
// Testing with synchronous deep copy
{

View File

@ -0,0 +1,227 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
#define KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
#include <gtest/gtest.h>
#include <iostream>
#include <Kokkos_Core.hpp>
namespace Test {
// Just save the data in the report. Informative text goes in the operator<<(..).
// Plain aggregate carrying the three values captured for one error report.
// It deliberately holds data only; the human-readable rendering lives in the
// stream operator below.
template <typename DataType1, typename DataType2, typename DataType3>
struct ThreeValReport
{
  DataType1 m_data1;
  DataType2 m_data2;
  DataType3 m_data3;
};
// Renders a report as "{m_data1 m_data2 m_data3}" (space-separated, braced).
template <typename DataType1, typename DataType2, typename DataType3>
std::ostream &operator<<(std::ostream & os, const ThreeValReport<DataType1, DataType2, DataType3> &val)
{
  os << "{" << val.m_data1;
  os << " " << val.m_data2;
  os << " " << val.m_data3;
  os << "}";
  return os;
}
// Verifies that the reporter ids and report payloads returned by
// ErrorReporter::getReports() are consistent: only odd work indices file
// reports (see ErrorReporterDriverBase::error_condition), and each report's
// m_data1 must echo the id of the reporter that filed it.
//
// Fix: the original loop was bounded by reports.size() but indexed
// reporters[i], an out-of-bounds read whenever the two vectors disagreed in
// length. We now flag a length mismatch explicitly and bound the loop by the
// smaller size so the comparison itself never reads past either vector.
template<typename ReportType>
void checkReportersAndReportsAgree(const std::vector<int> &reporters,
const std::vector<ReportType> &reports)
{
EXPECT_EQ(reporters.size(), reports.size());
const size_t num_common = std::min(reporters.size(), reports.size());
for (size_t i = 0; i < num_common; ++i) {
EXPECT_EQ(1, reporters[i] % 2);
EXPECT_EQ(reporters[i], reports[i].m_data1);
}
}
// Common scaffolding for the ErrorReporter test drivers: owns the reporter,
// defines which work indices "fail" (the odd ones), and checks the resulting
// report counts against the expected values.
template <typename DeviceType>
struct ErrorReporterDriverBase {
typedef ThreeValReport<int, int, double> report_type;
typedef Kokkos::Experimental::ErrorReporter<report_type, DeviceType> error_reporter_type;
error_reporter_type m_errorReporter;
// test_size is accepted for the derived drivers' benefit; only the reporter
// capacity is consumed here.
ErrorReporterDriverBase(int reporter_capacity, int test_size)
: m_errorReporter(reporter_capacity) { }
// Every odd work index triggers a report, i.e. test_size/2 reports total.
KOKKOS_INLINE_FUNCTION bool error_condition(const int work_idx) const { return (work_idx % 2 != 0); }
// Asserts the reporter captured min(capacity, test_size/2) reports out of
// test_size/2 attempts, and that full() reflects whether capacity was hit.
void check_expectations(int reporter_capacity, int test_size)
{
int num_reported = m_errorReporter.getNumReports();
int num_attempts = m_errorReporter.getNumReportAttempts();
int expected_num_reports = std::min(reporter_capacity, test_size / 2);
EXPECT_EQ(expected_num_reports, num_reported);
EXPECT_EQ(test_size / 2, num_attempts);
bool expect_full = (reporter_capacity <= (test_size / 2));
bool reported_full = m_errorReporter.full();
EXPECT_EQ(expect_full, reported_full);
}
};
// End-to-end ErrorReporter test: runs a driver in two capacity regimes and
// validates the reports retrieved through both the std::vector and the
// Kokkos::View overloads of getReports().
template <typename ErrorReporterDriverType>
void TestErrorReporter()
{
typedef ErrorReporterDriverType tester_type;
std::vector<int> reporters;
std::vector<typename tester_type::report_type> reports;
// Capacity (100) exceeds the report count (10/2 = 5): nothing is dropped.
tester_type test1(100, 10);
test1.m_errorReporter.getReports(reporters, reports);
checkReportersAndReportsAgree(reporters, reports);
// Capacity (10) is below the report count (100/2 = 50): reporter saturates.
tester_type test2(10, 100);
test2.m_errorReporter.getReports(reporters, reports);
checkReportersAndReportsAgree(reporters, reports);
// Retrieve the same reports through the View-based overload and copy them
// element-by-element into vectors so the same consistency check applies.
typename Kokkos::View<int*, typename ErrorReporterDriverType::execution_space >::HostMirror view_reporters;
typename Kokkos::View<typename tester_type::report_type*, typename ErrorReporterDriverType::execution_space >::HostMirror
view_reports;
test2.m_errorReporter.getReports(view_reporters, view_reports);
int num_reports = view_reporters.extent(0);
reporters.clear();
reports.clear();
reporters.reserve(num_reports);
reports.reserve(num_reports);
for (int i = 0; i < num_reports; ++i) {
reporters.push_back(view_reporters(i));
reports.push_back(view_reports(i));
}
checkReportersAndReportsAgree(reporters, reports);
}
// Driver that files reports from a Kokkos::parallel_for functor. Also
// exercises clear() + resize() by re-running with a capacity large enough to
// hold every report when the first pass saturated the reporter.
template <typename DeviceType>
struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
{
typedef ErrorReporterDriverBase<DeviceType> driver_base;
typedef typename driver_base::error_reporter_type::execution_space execution_space;
ErrorReporterDriver(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size)
{
execute(reporter_capacity, test_size);
// Test that clear() and resize() work across memory spaces.
if (reporter_capacity < test_size) {
driver_base::m_errorReporter.clear();
driver_base::m_errorReporter.resize(test_size);
execute(test_size, test_size);
}
}
// Launches the parallel loop over [0, test_size) and checks the counts.
void execute(int reporter_capacity, int test_size)
{
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
driver_base::check_expectations(reporter_capacity, test_size);
}
// Parallel-for body: odd work indices file a report {idx, -2*idx, pi*idx}.
KOKKOS_INLINE_FUNCTION
void operator()(const int work_idx) const
{
if (driver_base::error_condition(work_idx)) {
double val = M_PI * static_cast<double>(work_idx);
typename driver_base::report_type report = {work_idx, -2*work_idx, val};
driver_base::m_errorReporter.add_report(work_idx, report);
}
}
};
#if defined(KOKKOS_CLASS_LAMBDA)
// Same scenario as ErrorReporterDriver, but the parallel body is a
// KOKKOS_CLASS_LAMBDA instead of the functor's operator(); verifies that
// add_report() works from device lambdas capturing *this.
template <typename DeviceType>
struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
{
typedef ErrorReporterDriverBase<DeviceType> driver_base;
typedef typename driver_base::error_reporter_type::execution_space execution_space;
ErrorReporterDriverUseLambda(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size)
{
// Odd work indices file a report {idx, -2*idx, pi*idx}, mirroring the
// functor-based driver.
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), KOKKOS_CLASS_LAMBDA (const int work_idx) {
if (driver_base::error_condition(work_idx)) {
double val = M_PI * static_cast<double>(work_idx);
typename driver_base::report_type report = {work_idx, -2*work_idx, val};
driver_base::m_errorReporter.add_report(work_idx, report);
}
});
driver_base::check_expectations(reporter_capacity, test_size);
}
};
#endif
#ifdef KOKKOS_HAVE_OPENMP
// Files reports from a raw '#pragma omp parallel for' loop rather than a
// Kokkos::parallel_for, verifying that ErrorReporter::add_report() is safe to
// call from native OpenMP threads.
// Fix: removed the stray ';' that followed the loop's closing brace (it was a
// harmless empty statement, but triggers -Wextra-semi-stmt style warnings).
// NOTE(review): M_PI is POSIX/implementation-provided, not ISO C++ — assumed
// available because the functor-based drivers above rely on it too.
struct ErrorReporterDriverNativeOpenMP : public ErrorReporterDriverBase<Kokkos::OpenMP>
{
typedef ErrorReporterDriverBase<Kokkos::OpenMP> driver_base;
typedef typename driver_base::error_reporter_type::execution_space execution_space;
ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size)
{
// Odd work indices file a report {idx, -2*idx, pi*idx}; see error_condition().
#pragma omp parallel for
for(int work_idx = 0; work_idx < test_size; ++work_idx)
{
if (driver_base::error_condition(work_idx)) {
double val = M_PI * static_cast<double>(work_idx);
typename driver_base::report_type report = {work_idx, -2*work_idx, val};
driver_base::m_errorReporter.add_report(work_idx, report);
}
}
driver_base::check_expectations(reporter_capacity, test_size);
}
};
#endif
} // namespace Test
#endif // #ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP

View File

@ -56,12 +56,14 @@
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestDynamicView.hpp>
#include <TestSegmentedView.hpp>
#include <TestComplex.hpp>
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <iomanip>
namespace Test {
@ -143,11 +145,6 @@ TEST_F( openmp , staticcrsgraph )
test_dualview_combinations<int,Kokkos::OpenMP>(size); \
}
#define OPENMP_SEGMENTEDVIEW_TEST( size ) \
TEST_F( openmp, segmentedview_##size##x) { \
test_segmented_view<double,Kokkos::OpenMP>(size); \
}
OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true)
OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false)
OPENMP_FAILED_INSERT_TEST( 10000, 1000 )
@ -156,7 +153,6 @@ OPENMP_DEEP_COPY( 10000, 1 )
OPENMP_VECTOR_COMBINE_TEST( 10 )
OPENMP_VECTOR_COMBINE_TEST( 3057 )
OPENMP_DUALVIEW_COMBINE_TEST( 10 )
OPENMP_SEGMENTEDVIEW_TEST( 10000 )
#undef OPENMP_INSERT_TEST
#undef OPENMP_FAILED_INSERT_TEST
@ -164,7 +160,6 @@ OPENMP_SEGMENTEDVIEW_TEST( 10000 )
#undef OPENMP_DEEP_COPY
#undef OPENMP_VECTOR_COMBINE_TEST
#undef OPENMP_DUALVIEW_COMBINE_TEST
#undef OPENMP_SEGMENTEDVIEW_TEST
#endif
@ -178,5 +173,22 @@ TEST_F( openmp , dynamic_view )
}
}
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(openmp, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::OpenMP>>();
}
#endif
TEST_F(openmp, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::OpenMP>>();
}
TEST_F(openmp, ErrorReporterNativeOpenMP)
{
TestErrorReporter<ErrorReporterDriverNativeOpenMP>();
}
} // namespace test

View File

@ -1,708 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP
#define KOKKOS_TEST_SEGMENTEDVIEW_HPP
#include <gtest/gtest.h>
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <Kokkos_Core.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <Kokkos_SegmentedView.hpp>
#include <impl/Kokkos_Timer.hpp>
namespace Test {
namespace Impl {
template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
struct GrowTest;
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 1> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+team_member.team_size());
value += team_idx + team_member.team_rank();
if((a.dimension_0()>team_idx+team_member.team_rank()) &&
(a.dimension(0)>team_idx+team_member.team_rank()))
a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank();
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 2> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
value += team_idx + team_member.team_rank() + 13*k;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) {
a(team_idx+ team_member.team_rank(),k) =
team_idx+ team_member.team_rank() + 13*k;
}
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 3> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
value += team_idx + team_member.team_rank() + 13*k + 3*l;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
a(team_idx+ team_member.team_rank(),k,l) =
team_idx+ team_member.team_rank() + 13*k + 3*l;
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 4> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
for( typename ExecutionSpace::size_type m=0;m<2;m++)
value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
a(team_idx+ team_member.team_rank(),k,l,m) =
team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m;
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 5> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
for( typename ExecutionSpace::size_type m=0;m<2;m++)
for( typename ExecutionSpace::size_type n=0;n<3;n++)
value +=
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
a(team_idx+ team_member.team_rank(),k,l,m,n) =
team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 6> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
for( typename ExecutionSpace::size_type m=0;m<2;m++)
for( typename ExecutionSpace::size_type n=0;n<3;n++)
for( typename ExecutionSpace::size_type o=0;o<2;o++)
value +=
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
a(team_idx+ team_member.team_rank(),k,l,m,n,o) =
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 7> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx+ team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
for( typename ExecutionSpace::size_type m=0;m<2;m++)
for( typename ExecutionSpace::size_type n=0;n<3;n++)
for( typename ExecutionSpace::size_type o=0;o<2;o++)
for( typename ExecutionSpace::size_type p=0;p<4;p++)
value +=
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
a(team_idx+ team_member.team_rank(),k,l,m,n,o,p) =
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
}
}
};
template<class ViewType , class ExecutionSpace>
struct GrowTest<ViewType , ExecutionSpace , 8> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
GrowTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
a.grow(team_member , team_idx + team_member.team_size());
for( typename ExecutionSpace::size_type k=0;k<7;k++)
for( typename ExecutionSpace::size_type l=0;l<3;l++)
for( typename ExecutionSpace::size_type m=0;m<2;m++)
for( typename ExecutionSpace::size_type n=0;n<3;n++)
for( typename ExecutionSpace::size_type o=0;o<2;o++)
for( typename ExecutionSpace::size_type p=0;p<4;p++)
for( typename ExecutionSpace::size_type q=0;q<3;q++)
value +=
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q) =
team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
}
}
};
template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
struct VerifyTest;
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 1> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
VerifyTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
value += a(team_idx+ team_member.team_rank());
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 2> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
VerifyTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
value += a(team_idx+ team_member.team_rank(),k);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 3> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
ViewType a;
VerifyTest(ViewType in):a(in) {}
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
unsigned int team_idx = team_member.league_rank() * team_member.team_size();
if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
(a.dimension(0)>team_idx+ team_member.team_rank())) {
for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
value += a(team_idx+ team_member.team_rank(),k,l);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 4> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
// Rank-4 view whose elements are summed into the reduction result.
ViewType a;
VerifyTest(ViewType in):a(in) {}
// Each thread sums every element of its own first-dimension slice
// (league-wide thread id) into 'value'; out-of-range slices are skipped.
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
const unsigned int row = team_member.league_rank() * team_member.team_size()
                       + team_member.team_rank();
if( (row < a.dimension_0()) && (row < a.dimension(0)) ) {
for( typename ExecutionSpace::size_type i1=0; i1<a.dimension_1(); ++i1 )
for( typename ExecutionSpace::size_type i2=0; i2<a.dimension_2(); ++i2 )
for( typename ExecutionSpace::size_type i3=0; i3<a.dimension_3(); ++i3 )
value += a(row,i1,i2,i3);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 5> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
// Rank-5 view whose elements are summed into the reduction result.
ViewType a;
VerifyTest(ViewType in):a(in) {}
// Each thread sums every element of its own first-dimension slice
// (league-wide thread id) into 'value'; out-of-range slices are skipped.
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
const unsigned int row = team_member.league_rank() * team_member.team_size()
                       + team_member.team_rank();
if( (row < a.dimension_0()) && (row < a.dimension(0)) ) {
for( typename ExecutionSpace::size_type i1=0; i1<a.dimension_1(); ++i1 )
for( typename ExecutionSpace::size_type i2=0; i2<a.dimension_2(); ++i2 )
for( typename ExecutionSpace::size_type i3=0; i3<a.dimension_3(); ++i3 )
for( typename ExecutionSpace::size_type i4=0; i4<a.dimension_4(); ++i4 )
value += a(row,i1,i2,i3,i4);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 6> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
// Rank-6 view whose elements are summed into the reduction result.
ViewType a;
VerifyTest(ViewType in):a(in) {}
// Each thread sums every element of its own first-dimension slice
// (league-wide thread id) into 'value'; out-of-range slices are skipped.
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
const unsigned int row = team_member.league_rank() * team_member.team_size()
                       + team_member.team_rank();
if( (row < a.dimension_0()) && (row < a.dimension(0)) ) {
for( typename ExecutionSpace::size_type i1=0; i1<a.dimension_1(); ++i1 )
for( typename ExecutionSpace::size_type i2=0; i2<a.dimension_2(); ++i2 )
for( typename ExecutionSpace::size_type i3=0; i3<a.dimension_3(); ++i3 )
for( typename ExecutionSpace::size_type i4=0; i4<a.dimension_4(); ++i4 )
for( typename ExecutionSpace::size_type i5=0; i5<a.dimension_5(); ++i5 )
value += a(row,i1,i2,i3,i4,i5);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 7> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
// Rank-7 view whose elements are summed into the reduction result.
ViewType a;
VerifyTest(ViewType in):a(in) {}
// Each thread sums every element of its own first-dimension slice
// (league-wide thread id) into 'value'; out-of-range slices are skipped.
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
const unsigned int row = team_member.league_rank() * team_member.team_size()
                       + team_member.team_rank();
if( (row < a.dimension_0()) && (row < a.dimension(0)) ) {
for( typename ExecutionSpace::size_type i1=0; i1<a.dimension_1(); ++i1 )
for( typename ExecutionSpace::size_type i2=0; i2<a.dimension_2(); ++i2 )
for( typename ExecutionSpace::size_type i3=0; i3<a.dimension_3(); ++i3 )
for( typename ExecutionSpace::size_type i4=0; i4<a.dimension_4(); ++i4 )
for( typename ExecutionSpace::size_type i5=0; i5<a.dimension_5(); ++i5 )
for( typename ExecutionSpace::size_type i6=0; i6<a.dimension_6(); ++i6 )
value += a(row,i1,i2,i3,i4,i5,i6);
}
}
};
template<class ViewType , class ExecutionSpace>
struct VerifyTest<ViewType , ExecutionSpace , 8> {
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
typedef typename Policy::member_type team_type;
typedef double value_type;
// Rank-8 view whose elements are summed into the reduction result.
ViewType a;
VerifyTest(ViewType in):a(in) {}
// Each thread sums every element of its own first-dimension slice
// (league-wide thread id) into 'value'; out-of-range slices are skipped.
KOKKOS_INLINE_FUNCTION
void operator() (team_type team_member, double& value) const {
const unsigned int row = team_member.league_rank() * team_member.team_size()
                       + team_member.team_rank();
if( (row < a.dimension_0()) && (row < a.dimension(0)) ) {
for( typename ExecutionSpace::size_type i1=0; i1<a.dimension_1(); ++i1 )
for( typename ExecutionSpace::size_type i2=0; i2<a.dimension_2(); ++i2 )
for( typename ExecutionSpace::size_type i3=0; i3<a.dimension_3(); ++i3 )
for( typename ExecutionSpace::size_type i4=0; i4<a.dimension_4(); ++i4 )
for( typename ExecutionSpace::size_type i5=0; i5<a.dimension_5(); ++i5 )
for( typename ExecutionSpace::size_type i6=0; i6<a.dimension_6(); ++i6 )
for( typename ExecutionSpace::size_type i7=0; i7<a.dimension_7(); ++i7 )
value += a(row,i1,i2,i3,i4,i5,i6,i7);
}
}
};
// Driver that exercises Kokkos::Experimental::SegmentedView for one element
// type and execution space: grows/fills a segmented view inside a
// team-parallel kernel (GrowTest) and re-reads it (VerifyTest), storing the
// two reduction sums for the caller to compare.
template <typename Scalar, class ExecutionSpace>
struct test_segmented_view
{
typedef test_segmented_view<Scalar,ExecutionSpace> self_type;
typedef Scalar scalar_type;
typedef ExecutionSpace execution_space;
typedef Kokkos::TeamPolicy<execution_space> Policy;
// Sum produced while re-reading the view (VerifyTest kernel).
double result;
// Sum produced while growing/filling the view (GrowTest kernel).
double reference;
// Run the grow kernel then the verify kernel over view 'a'.
// 'max_length' bounds the number of first-dimension entries touched.
// NOTE(review): nteams = max_length/team_size truncates, so a tail of up
// to team_size-1 entries is never visited; both kernels use the same
// policy, so 'reference' and 'result' still agree.
template <class ViewType>
void run_me(ViewType a, int max_length){
const int team_size = Policy::team_size_max( GrowTest<ViewType,execution_space>(a) );
const int nteams = max_length/team_size;
reference = 0;
result = 0;
Kokkos::parallel_reduce(Policy(nteams,team_size),GrowTest<ViewType,execution_space>(a),reference);
Kokkos::fence();
Kokkos::parallel_reduce(Policy(nteams,team_size),VerifyTest<ViewType,execution_space>(a),result);
Kokkos::fence();
}
// Build a SegmentedView of the requested rank (1..8) with a segment size
// of 128 and run the grow/verify kernels on it.  Trailing extents are
// either compile-time (e.g. Scalar*[7][3][2]) or the dim_* runtime values
// below; the two spellings are mixed deliberately to cover both paths.
test_segmented_view(unsigned int size,int rank)
{
reference = 0;
result = 0;
const int dim_1 = 7;
const int dim_2 = 3;
const int dim_3 = 2;
const int dim_4 = 3;
const int dim_5 = 2;
const int dim_6 = 4;
//const int dim_7 = 3;
if(rank==1) {
typedef Kokkos::Experimental::SegmentedView<Scalar*,Kokkos::LayoutLeft,ExecutionSpace> rank1_view;
run_me< rank1_view >(rank1_view("Rank1",128,size), size);
}
if(rank==2) {
typedef Kokkos::Experimental::SegmentedView<Scalar**,Kokkos::LayoutLeft,ExecutionSpace> rank2_view;
run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size);
}
if(rank==3) {
typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2],Kokkos::LayoutRight,ExecutionSpace> rank3_view;
run_me< rank3_view >(rank3_view("Rank3",128,size), size);
}
if(rank==4) {
typedef Kokkos::Experimental::SegmentedView<Scalar****,Kokkos::LayoutRight,ExecutionSpace> rank4_view;
run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size);
}
if(rank==5) {
typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2][3],Kokkos::LayoutLeft,ExecutionSpace> rank5_view;
run_me< rank5_view >(rank5_view("Rank5",128,size), size);
}
if(rank==6) {
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2],Kokkos::LayoutRight,ExecutionSpace> rank6_view;
run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size);
}
if(rank==7) {
typedef Kokkos::Experimental::SegmentedView<Scalar*******,Kokkos::LayoutLeft,ExecutionSpace> rank7_view;
run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size);
}
if(rank==8) {
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> rank8_view;
run_me< rank8_view >(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size);
}
}
};
} // namespace Impl
// Unit-test entry point: first checks the dimension/rank queries of an 8D
// segmented view after a grow pass, then runs the grow/verify driver for
// every supported rank (1..8) and asserts the two reduction sums match.
template <typename Scalar, class ExecutionSpace>
void test_segmented_view(unsigned int size)
{
{
typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> view_type;
view_type a("A",128,size,7,3,2,3);
// 'reference' is written by parallel_reduce before it is read.
double reference;
Impl::GrowTest<view_type,ExecutionSpace> f(a);
const int team_size = Kokkos::TeamPolicy<ExecutionSpace>::team_size_max( f );
// Round up so every one of the 'size' entries is visited.
const int nteams = (size+team_size-1)/team_size;
Kokkos::parallel_reduce(Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),f,reference);
// The segmented view grows in whole segments of 128, so the reported
// first extent is 'size' rounded up to a multiple of 128.
size_t real_size = ((size+127)/128)*128;
ASSERT_EQ(real_size,a.dimension_0());
ASSERT_EQ(7,a.dimension_1());
ASSERT_EQ(3,a.dimension_2());
ASSERT_EQ(2,a.dimension_3());
ASSERT_EQ(3,a.dimension_4());
ASSERT_EQ(2,a.dimension_5());
ASSERT_EQ(4,a.dimension_6());
ASSERT_EQ(3,a.dimension_7());
// The runtime dimension(i) query must agree with dimension_i().
ASSERT_EQ(real_size,a.dimension(0));
ASSERT_EQ(7,a.dimension(1));
ASSERT_EQ(3,a.dimension(2));
ASSERT_EQ(2,a.dimension(3));
ASSERT_EQ(3,a.dimension(4));
ASSERT_EQ(2,a.dimension(5));
ASSERT_EQ(4,a.dimension(6));
ASSERT_EQ(3,a.dimension(7));
ASSERT_EQ(8,a.Rank);
}
// For each rank 1..8 the grow-pass sum must equal the verify-pass sum.
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,1);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,2);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,3);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,4);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,5);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,6);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,7);
ASSERT_EQ(test.reference,test.result);
}
{
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,8);
ASSERT_EQ(test.reference,test.result);
}
}
} // namespace Test
#else
// No-op fallback used when SegmentedView testing is disabled for this build.
template <typename Scalar, class ExecutionSpace>
void test_segmented_view(unsigned int ) {}
#endif
#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */

View File

@ -58,7 +58,6 @@
#include <TestStaticCrsGraph.hpp>
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestSegmentedView.hpp>
#include <TestDynamicView.hpp>
#include <TestComplex.hpp>
@ -67,6 +66,9 @@
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
namespace Test {
class serial : public ::testing::Test {
@ -135,11 +137,6 @@ TEST_F( serial, bitset )
test_dualview_combinations<int,Kokkos::Serial>(size); \
}
#define SERIAL_SEGMENTEDVIEW_TEST( size ) \
TEST_F( serial, segmentedview_##size##x) { \
test_segmented_view<double,Kokkos::Serial>(size); \
}
SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true)
SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false)
SERIAL_FAILED_INSERT_TEST( 10000, 1000 )
@ -148,7 +145,6 @@ SERIAL_DEEP_COPY( 10000, 1 )
SERIAL_VECTOR_COMBINE_TEST( 10 )
SERIAL_VECTOR_COMBINE_TEST( 3057 )
SERIAL_DUALVIEW_COMBINE_TEST( 10 )
SERIAL_SEGMENTEDVIEW_TEST( 10000 )
#undef SERIAL_INSERT_TEST
#undef SERIAL_FAILED_INSERT_TEST
@ -156,7 +152,6 @@ SERIAL_SEGMENTEDVIEW_TEST( 10000 )
#undef SERIAL_DEEP_COPY
#undef SERIAL_VECTOR_COMBINE_TEST
#undef SERIAL_DUALVIEW_COMBINE_TEST
#undef SERIAL_SEGMENTEDVIEW_TEST
TEST_F( serial , dynamic_view )
{
@ -168,6 +163,19 @@ TEST_F( serial , dynamic_view )
}
}
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(serial, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Serial>>();
}
#endif
TEST_F(serial, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::Serial>>();
}
} // namespace Test
#endif // KOKKOS_HAVE_SERIAL

View File

@ -62,11 +62,13 @@
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestDynamicView.hpp>
#include <TestSegmentedView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
namespace Test {
class threads : public ::testing::Test {
@ -145,12 +147,6 @@ TEST_F( threads , staticcrsgraph )
test_dualview_combinations<int,Kokkos::Threads>(size); \
}
#define THREADS_SEGMENTEDVIEW_TEST( size ) \
TEST_F( threads, segmentedview_##size##x) { \
test_segmented_view<double,Kokkos::Threads>(size); \
}
THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false)
THREADS_FAILED_INSERT_TEST( 10000, 1000 )
THREADS_DEEP_COPY( 10000, 1 )
@ -158,7 +154,6 @@ THREADS_DEEP_COPY( 10000, 1 )
THREADS_VECTOR_COMBINE_TEST( 10 )
THREADS_VECTOR_COMBINE_TEST( 3057 )
THREADS_DUALVIEW_COMBINE_TEST( 10 )
THREADS_SEGMENTEDVIEW_TEST( 10000 )
#undef THREADS_INSERT_TEST
@ -167,8 +162,6 @@ THREADS_SEGMENTEDVIEW_TEST( 10000 )
#undef THREADS_DEEP_COPY
#undef THREADS_VECTOR_COMBINE_TEST
#undef THREADS_DUALVIEW_COMBINE_TEST
#undef THREADS_SEGMENTEDVIEW_TEST
TEST_F( threads , dynamic_view )
@ -181,6 +174,19 @@ TEST_F( threads , dynamic_view )
}
}
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(threads, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Threads>>();
}
#endif
TEST_F(threads, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::Threads>>();
}
} // namespace Test

View File

@ -2,3 +2,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)
TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)

View File

@ -45,6 +45,16 @@
#define KOKKOS_ENABLE_PROFILING 0
#endif
#cmakedefine KOKKOS_HAVE_CUDA_RDC
#ifdef KOKKOS_HAVE_CUDA_RDC
#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1
#endif
#cmakedefine KOKKOS_HAVE_CUDA_LAMBDA
#ifdef KOKKOS_HAVE_CUDA_LAMBDA
#define KOKKOS_CUDA_USE_LAMBDA 1
#endif
// Don't forbid users from defining this macro on the command line,
// but still make sure that CMake logic can control its definition.
#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)

View File

@ -1,6 +1,6 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINRARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
SET(SOURCES
PerfTestMain.cpp
@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE(
TESTONLYLIBS kokkos_gtest
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
TRIBITS_ADD_TEST(
PerfTest
NAME PerfTestExec
COMM serial mpi

View File

@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/core/perf_test
default: build_all
echo "End Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
else
CXX = g++
endif
CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
TEST_TARGETS =

View File

@ -79,10 +79,21 @@ class host : public ::testing::Test {
protected:
static void SetUpTestCase()
{
const unsigned team_count = Kokkos::hwloc::get_available_numa_count();
const unsigned threads_per_team = 4 ;
if(Kokkos::hwloc::available()) {
const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
TestHostDevice::initialize( team_count * threads_per_team );
unsigned threads_count = 0 ;
threads_count = std::max( 1u , numa_count )
* std::max( 2u , cores_per_numa * threads_per_core );
TestHostDevice::initialize( threads_count );
} else {
const unsigned thread_count = 4 ;
TestHostDevice::initialize( thread_count );
}
}
static void TearDownTestCase()

View File

@ -1,334 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// CudaSpace specialization of the view bounds-error handler: device code
// cannot throw C++ exceptions, so report the first violated index with a
// device-side printf and then abort the kernel.
template<>
struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > {
KOKKOS_INLINE_FUNCTION
static void apply( const size_t rank
, const size_t n0 , const size_t n1
, const size_t n2 , const size_t n3
, const size_t n4 , const size_t n5
, const size_t n6 , const size_t n7
, const size_t i0 , const size_t i1
, const size_t i2 , const size_t i3
, const size_t i4 , const size_t i5
, const size_t i6 , const size_t i7 )
{
// NOTE(review): the 'rank' parameter is unused; the failing dimension is
// recomputed below from the extent/index pairs.
// Index of the first dimension whose extent is violated (nk <= ik).
const int r =
( n0 <= i0 ? 0 :
( n1 <= i1 ? 1 :
( n2 <= i2 ? 2 :
( n3 <= i3 ? 3 :
( n4 <= i4 ? 4 :
( n5 <= i5 ? 5 :
( n6 <= i6 ? 6 : 7 )))))));
// Extent of that dimension.
const size_t n =
( n0 <= i0 ? n0 :
( n1 <= i1 ? n1 :
( n2 <= i2 ? n2 :
( n3 <= i3 ? n3 :
( n4 <= i4 ? n4 :
( n5 <= i5 ? n5 :
( n6 <= i6 ? n6 : n7 )))))));
// Offending index in that dimension.
const size_t i =
( n0 <= i0 ? i0 :
( n1 <= i1 ? i1 :
( n2 <= i2 ? i2 :
( n3 <= i3 ? i3 :
( n4 <= i4 ? i4 :
( n5 <= i5 ? i5 :
( n6 <= i6 ? i6 : i7 )))))));
printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n );
Kokkos::Impl::cuda_abort("Cuda view array bounds error");
}
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
// Data handle that reads elements through a CUDA texture object.  ValueType
// is the user's scalar; AliasType is the 4/8/16-byte type (int/int2/int4)
// actually fetched and then bit-reinterpreted back to ValueType.
template< typename ValueType , typename AliasType >
struct CudaTextureFetch {
::cudaTextureObject_t m_obj ;
// Plain pointer fallback (also used for host-side access).
const ValueType * m_ptr ;
// Element offset of this handle's data within the texture object, which
// spans the whole allocation (see constructor comment below).
int m_offset ;
// Dereference operator pulls through texture object and returns by value.
// On device capability >= 3.0 it uses tex1Dfetch; otherwise (and on the
// host) it falls back to a plain load through m_ptr.
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_ptr[ i ];
#endif
}
// Pointer to referenced memory
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
// Special members are spelled out (not defaulted) so they carry the
// KOKKOS_INLINE_FUNCTION host/device annotation.
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( CudaTextureFetch && rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
// Texture object spans the entire allocation.
// This handle may view a subset of the allocation, so an offset is required.
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
)
: m_obj( record.template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
};
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
// Data handle that reads elements through the read-only data cache via the
// __ldg intrinsic (Kepler / compute capability >= 3.0).  ValueType is the
// user's scalar; AliasType is the same-size type passed to __ldg, with the
// fetched bits reinterpreted back to ValueType.
template< typename ValueType , typename AliasType >
struct CudaLDGFetch {

// Pointer to the referenced (device) memory.
const ValueType * m_ptr ;

// Dereference returns by value after an __ldg fetch of the alias bits.
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
}

// Pointer to referenced memory.
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }

// Special members are spelled out (not defaulted) so they carry the
// KOKKOS_INLINE_FUNCTION host/device annotation.
KOKKOS_INLINE_FUNCTION
CudaLDGFetch() : m_ptr() {}

KOKKOS_INLINE_FUNCTION
~CudaLDGFetch() {}

KOKKOS_INLINE_FUNCTION
CudaLDGFetch( const CudaLDGFetch & rhs )
: m_ptr( rhs.m_ptr )
{}

KOKKOS_INLINE_FUNCTION
CudaLDGFetch( CudaLDGFetch && rhs )
: m_ptr( rhs.m_ptr )
{}

KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}

KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}

// BUG FIX: this constructor was misnamed 'CudaTextureFetch' (copy/paste
// error) and initialized m_ptr from the undeclared name 'arg_data_ptr',
// so the struct could not compile when KOKKOS_CUDA_USE_LDG_INTRINSIC was
// defined.  The record argument is unused: __ldg needs no texture object.
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
: m_ptr( arg_ptr )
{}
};
#endif
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
// Specialization of ViewDataHandle that swaps the default raw-pointer handle
// for a texture-fetch (or __ldg) handle.  Selected only for views that are:
// const, trivially copyable, of size 4/8/16 bytes, resident in CudaSpace or
// CudaUVMSpace, and declared with the RandomAccess memory trait.
template< class Traits >
class ViewDataHandle< Traits ,
typename std::enable_if<(
// Is Cuda memory space
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
&&
// Is a trivial const value of 4, 8, or 16 bytes
std::is_trivial<typename Traits::const_value_type>::value
&&
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
&&
( sizeof(typename Traits::const_value_type) == 4 ||
sizeof(typename Traits::const_value_type) == 8 ||
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::RandomAccess != 0 )
)>::type >
{
public:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using value_type = typename Traits::const_value_type ;
using return_type = typename Traits::const_value_type ; // NOT a reference
// Same-size integer type actually fetched: int / int2 / int4.
using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int ,
typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 ,
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
>::type
>::type
>::type ;
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
#else
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
#endif
// Assigning handle from handle needs no texture-object work: reuse it.
KOKKOS_INLINE_FUNCTION
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
{
return arg_handle ;
}
// Assigning handle from a raw pointer must create/attach a texture
// object, which is only possible on the host.
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
#endif
}
};
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@ -46,6 +46,7 @@
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <atomic>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
@ -58,6 +59,11 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
@ -65,6 +71,9 @@ namespace Kokkos {
namespace Impl {
namespace {
static std::atomic<int> num_uvm_allocations(0) ;
cudaStream_t get_deep_copy_stream() {
static cudaStream_t s = 0;
if( s == 0) {
@ -119,6 +128,7 @@ void CudaSpace::access_error( const void * const )
Kokkos::Impl::throw_runtime_exception( msg );
}
/*--------------------------------------------------------------------------*/
bool CudaUVMSpace::available()
@ -133,6 +143,11 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
int CudaUVMSpace::number_of_allocations()
{
return Kokkos::Impl::num_uvm_allocations.load();
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -167,7 +182,18 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
enum { max_uvm_allocations = 65536 };
if ( arg_alloc_size > 0 )
{
Kokkos::Impl::num_uvm_allocations++;
if ( Kokkos::Impl::num_uvm_allocations.load() > max_uvm_allocations ) {
Kokkos::Impl::throw_runtime_exception( "CudaUVM error: The maximum limit of UVM allocations exceeded (currently 65536)." ) ;
}
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
}
return ptr ;
}
@ -191,7 +217,10 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
if ( arg_alloc_ptr != nullptr ) {
Kokkos::Impl::num_uvm_allocations--;
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
}
} catch(...) {}
}
@ -202,13 +231,24 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
} catch(...) {}
}
constexpr const char* CudaSpace::name() {
return m_name;
}
constexpr const char* CudaUVMSpace::name() {
return m_name;
}
constexpr const char* CudaHostPinnedSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
SharedAllocationRecord< void , void >
@ -335,6 +375,18 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::CudaSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),header.m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
@ -343,6 +395,15 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
@ -351,6 +412,14 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
@ -373,6 +442,12 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
SharedAllocationHeader header ;
// Fill in the Header information
@ -404,7 +479,12 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
// Fill in the Header information, directly accessible via UVM
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
@ -430,6 +510,11 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
@ -502,6 +587,7 @@ void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
@ -587,7 +673,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#else
@ -598,7 +684,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
if ( record == 0 ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#endif
@ -615,7 +701,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
@ -630,7 +716,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
@ -728,7 +814,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/

View File

@ -384,10 +384,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
const bool ok_id = 0 <= cuda_device_id &&
cuda_device_id < dev_info.m_cudaDevCount ;
// Need device capability 2.0 or better
// Need device capability 3.0 or better
const bool ok_dev = ok_id &&
( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
( 3 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
if ( ok_init && ok_dev ) {
@ -444,7 +444,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
//----------------------------------
// Maximum number of blocks:
m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
m_maxBlock = cudaProp.maxGridSize[0] ;
//----------------------------------
@ -495,7 +495,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
msg << "." ;
msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
msg << " has insufficient capability, required 2.0 or better" ;
msg << " has insufficient capability, required 3.0 or better" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -95,27 +95,42 @@ private:
public:
#if defined( __CUDA_ARCH__ )
__device__ inline
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
__device__ inline
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,1,0) ; }
__device__ inline
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & thread_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
__device__ inline int league_rank() const { return m_league_rank ; }
__device__ inline int league_size() const { return m_league_size ; }
__device__ inline int team_rank() const { return threadIdx.y ; }
__device__ inline int team_size() const { return blockDim.y ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const {
#ifdef __CUDA_ARCH__
return threadIdx.y ;
#else
return 1;
#endif
}
KOKKOS_INLINE_FUNCTION int team_size() const {
#ifdef __CUDA_ARCH__
return blockDim.y ;
#else
return 1;
#endif
}
__device__ inline void team_barrier() const { __syncthreads(); }
KOKKOS_INLINE_FUNCTION void team_barrier() const {
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
}
template<class ValueType>
__device__ inline void team_broadcast(ValueType& value, const int& thread_id) const {
KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, const int& thread_id) const {
#ifdef __CUDA_ARCH__
__shared__ ValueType sh_val;
if(threadIdx.x == 0 && threadIdx.y == thread_id) {
sh_val = value;
@ -123,26 +138,17 @@ public:
team_barrier();
value = sh_val;
team_barrier();
#endif
}
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
__device__ inline
KOKKOS_INLINE_FUNCTION
typename JoinOp::value_type team_reduce( const ValueType & value
, const JoinOp & op_in ) const
{
, const JoinOp & op_in ) const {
#ifdef __CUDA_ARCH__
typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
const JoinOpFunctor op(op_in);
ValueType * const base_data = (ValueType *) m_team_reduce ;
#else
template< class JoinOp >
__device__ inline
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef JoinOp JoinOpFunctor ;
typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ;
#endif
__syncthreads(); // Don't write in to shared data until all threads have entered this function
@ -153,6 +159,9 @@ public:
Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data );
return base_data[ blockDim.y - 1 ];
#else
return typename JoinOp::value_type();
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
@ -165,8 +174,8 @@ public:
* non-deterministic.
*/
template< typename Type >
__device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
{
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const {
#ifdef __CUDA_ARCH__
Type * const base_data = (Type *) m_team_reduce ;
__syncthreads(); // Don't write in to shared data until all threads have entered this function
@ -186,6 +195,9 @@ public:
}
return base_data[ threadIdx.y ];
#else
return Type();
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
@ -194,13 +206,14 @@ public:
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
__device__ inline Type team_scan( const Type & value ) const
{ return this->template team_scan<Type>( value , 0 ); }
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
return this->template team_scan<Type>( value , 0 );
}
//----------------------------------------
// Private for the driver
__device__ inline
KOKKOS_INLINE_FUNCTION
CudaTeamMember( void * shared
, const int shared_begin
, const int shared_size
@ -210,51 +223,10 @@ public:
, const int arg_league_size )
: m_team_reduce( shared )
, m_team_shared( ((char *)shared) + shared_begin , shared_size, scratch_level_1_ptr, scratch_level_1_size)
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
#else
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(0, 1,0) ; }
const execution_space::scratch_memory_space & team_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,1,0) ; }
const execution_space::scratch_memory_space & thread_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
int league_rank() const {return 0;}
int league_size() const {return 1;}
int team_rank() const {return 0;}
int team_size() const {return 1;}
void team_barrier() const {}
template<class ValueType>
void team_broadcast(ValueType& value, const int& thread_id) const {}
template< class JoinOp >
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const {return typename JoinOp::value_type();}
template< typename Type >
Type team_scan( const Type & value , Type * const global_accum ) const {return Type();}
template< typename Type >
Type team_scan( const Type & value ) const {return Type();}
//----------------------------------------
// Private for the driver
CudaTeamMember( void * shared
, const int shared_begin
, const int shared_end
, void* scratch_level_1_ptr
, const int scratch_level_1_size
, const int arg_league_rank
, const int arg_league_size );
#endif /* #if ! defined( __CUDA_ARCH__ ) */
};
} // namespace Impl
@ -356,7 +328,7 @@ public:
, m_vector_length( 0 )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
, m_chunk_size ( 32 )
{}
/** \brief Specify league size, request team size */
@ -508,7 +480,7 @@ private:
typedef typename Policy::work_tag WorkTag ;
const FunctorType m_functor ;
const Policy m_policy ;
const Policy m_policy ;
ParallelFor() = delete ;
ParallelFor & operator = ( const ParallelFor & ) = delete ;
@ -638,8 +610,8 @@ public:
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy
)
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
@ -680,7 +652,7 @@ template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
, Kokkos::Cuda
>
{
private:
@ -835,23 +807,22 @@ public:
const int nwork = m_policy.end() - m_policy.begin();
if ( nwork ) {
const int block_size = local_block_size( m_functor );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
@ -871,8 +842,8 @@ public:
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
@ -925,7 +896,6 @@ private:
typedef typename ValueTraits::reference_type reference_type ;
typedef typename ValueTraits::value_type value_type ;
public:
typedef FunctorType functor_type ;
@ -937,7 +907,6 @@ private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
// shared memory utilization:
//
@ -1058,36 +1027,44 @@ public:
inline
void execute()
{
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
:std::min( m_league_size , m_team_size );
const int nwork = m_league_size * m_team_size ;
if ( nwork ) {
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
:std::min( m_league_size , m_team_size );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
const dim3 block( m_vector_size , m_team_size , 1 );
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
const dim3 block( m_vector_size , m_team_size , 1 );
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
}
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
@ -1106,9 +1083,18 @@ public:
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
@ -1342,7 +1328,7 @@ private:
}
// Scan block values into locations shared_data[1..blockDim.y]
cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) );
cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , typename ValueTraits::pointer_type(shared_data+word_count.value) );
{
size_type * const block_total = shared_data + word_count.value * blockDim.y ;
@ -1391,32 +1377,32 @@ public:
const int nwork = m_policy.end() - m_policy.begin();
if ( nwork ) {
enum { GridMaxComputeCapability_2x = 0x0ffff };
const int block_size = local_block_size( m_functor );
const int grid_max =
( block_size * block_size ) < GridMaxComputeCapability_2x ?
( block_size * block_size ) : GridMaxComputeCapability_2x ;
// At most 'max_grid' blocks:
const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
// How much work per block:
const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many block are really needed for this much work:
const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ;
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
const dim3 grid( grid_x , 1 , 1 );
const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
}
@ -1490,18 +1476,30 @@ namespace Impl {
#ifdef __CUDA_ARCH__
__device__ inline
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count):
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count):
start( threadIdx.x ),
end( count ),
increment( blockDim.x )
{}
__device__ inline
ThreadVectorRangeBoundariesStruct (const iType& count):
start( threadIdx.x ),
end( count ),
increment( blockDim.x )
{}
#else
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count):
start( 0 ),
end( count ),
increment( 1 )
{}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const iType& count):
start( 0 ),
end( count ),
increment( 1 )
{}
#endif
};
@ -1509,22 +1507,24 @@ namespace Impl {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,count);
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
}
template<typename iType>
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,begin,end);
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
}
@ -1571,9 +1571,10 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Cud
lambda(i,result);
}
Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src)
{ dst+=src; });
Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src)
{ dst+=src; });
#endif
}
@ -1923,4 +1924,3 @@ namespace Impl {
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */

View File

@ -139,6 +139,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
#ifdef __CUDA_ARCH__
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
@ -213,6 +214,9 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
return true;
#endif
}
//----------------------------------------------------------------------------
@ -290,10 +294,10 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_inter,n,8)
BLOCK_SCAN_STEP(tdata_inter,n,7)
BLOCK_SCAN_STEP(tdata_inter,n,6)
BLOCK_SCAN_STEP(tdata_inter,n,5)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
@ -308,12 +312,19 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
#ifdef KOKKOS_CUDA_CLANG_WORKAROUND
BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads();
#else
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0)
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
#endif
}
#undef BLOCK_SCAN_STEP

View File

@ -43,7 +43,7 @@
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <impl/Kokkos_TaskQueue_impl.hpp>
@ -174,6 +174,6 @@ printf("cuda_task_queue_execute after\n");
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,7 +44,7 @@
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -99,7 +99,7 @@ public:
extern template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
* Cuda thread blocks for tasking are dimensioned:
@ -234,19 +234,23 @@ namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
}
template<typename iType>
template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
Impl::TeamThreadRangeBoundariesStruct
< typename std::common_type<iType1,iType2>::type
, Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType1 & begin, const iType2 & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
thread, iType(begin), iType(end) );
}
template<typename iType>
@ -315,7 +319,7 @@ ValueType shfl_warp_broadcast
}
// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -344,7 +348,7 @@ void parallel_reduce
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -372,7 +376,7 @@ void parallel_reduce
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -397,7 +401,7 @@ void parallel_reduce
// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -426,7 +430,7 @@ void parallel_reduce
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -469,7 +473,7 @@ void parallel_scan
}
// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
@ -514,6 +518,6 @@ void parallel_scan
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */

View File

@ -1,932 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
// #define DETAILED_PRINT
//----------------------------------------------------------------------------
#define QLOCK reinterpret_cast<void*>( ~((uintptr_t)0) )
#define QDENIED reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )
namespace Kokkos {
namespace Experimental {
namespace Impl {
void CudaTaskPolicyQueue::Destroy::destroy_shared_allocation()
{
// Verify the queue is empty
if ( m_policy->m_count_ready ||
m_policy->m_team[0] ||
m_policy->m_team[1] ||
m_policy->m_team[2] ||
m_policy->m_serial[0] ||
m_policy->m_serial[1] ||
m_policy->m_serial[2] ) {
Kokkos::abort("CudaTaskPolicyQueue ERROR : Attempt to destroy non-empty queue" );
}
m_policy->~CudaTaskPolicyQueue();
Kokkos::Cuda::fence();
}
CudaTaskPolicyQueue::
~CudaTaskPolicyQueue()
{
}
CudaTaskPolicyQueue::
CudaTaskPolicyQueue
  ( const unsigned arg_task_max_count
  , const unsigned arg_task_max_size
  , const unsigned arg_task_default_dependence_capacity
  , const unsigned arg_team_size
  )
  // Size the UVM memory pool with 20% slack over the worst-case
  // total task storage; superblock size is 2^16 bytes.
  : m_space( Kokkos::CudaUVMSpace()
           , arg_task_max_size * arg_task_max_count * 1.2
           , 16 /* log2(superblock size) */
           )
  , m_team   { 0 , 0 , 0 }
  , m_serial { 0 , 0 , 0 }
  , m_team_size( 32 /* 1 warps */ )
  , m_default_dependence_capacity( arg_task_default_dependence_capacity )
  , m_count_ready(0)
{
  // Grow the team size from one warp (32) to the smallest
  // power-of-two multiple that covers the request, capped at
  // 16 warps (512 threads).
  constexpr int team_size_cap = 32 * 16 /* 16 warps */ ;

  const int requested_size = std::min( int(arg_team_size) , team_size_cap );

  for ( ; m_team_size < requested_size ; ) { m_team_size *= 2 ; }
}
//-----------------------------------------------------------------------
// Called by each block & thread
// Device-side scheduling loop, entered by every thread block launched
// from wait().  Each block repeatedly pops team tasks (executed by the
// whole block) and serial tasks (executed by one thread per warp)
// until m_count_ready -- the global count of ready-plus-executing
// tasks -- reaches zero.
__device__
void Kokkos::Experimental::Impl::CudaTaskPolicyQueue::driver()
{
  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

  // Thread (0,0) of the block acts as team lead.
  #define IS_TEAM_LEAD ( threadIdx.x == 0 && threadIdx.y == 0 )

#ifdef DETAILED_PRINT
  if ( IS_TEAM_LEAD ) {
    printf( "CudaTaskPolicyQueue::driver() begin on %d with count %d\n"
          , blockIdx.x , m_count_ready );
  }
#endif

  // Each thread block must iterate this loop synchronously
  // to insure team-execution of team-task
  __shared__ task_root_type * team_task ;

  __syncthreads();

  do {

    // Only the team lead polls the team queues; the chosen task (or
    // the q_denied terminator) is broadcast to the rest of the block
    // through shared memory.
    if ( IS_TEAM_LEAD ) {
      if ( 0 == m_count_ready ) {
        team_task = q_denied ; // All queues are empty and no running tasks
      }
      else {
        team_task = 0 ;
        // Poll queues in priority order.  Queues [0] and [1] are
        // always polled; queue [2] is polled only while
        // ! m_space.is_empty() -- presumably it holds tasks waiting
        // for pool memory; TODO confirm MemoryPool::is_empty semantics.
        for ( int i = 0 ; i < int(NPRIORITY) && 0 == team_task ; ++i ) {
          if ( ( i < 2 /* regular queue */ )
            || ( ! m_space.is_empty() /* waiting for memory */ ) ) {
            team_task = pop_ready_task( & m_team[i] );
          }
        }
      }
    }

    __syncthreads();

#ifdef DETAILED_PRINT
    if ( IS_TEAM_LEAD && 0 != team_task ) {
      printf( "CudaTaskPolicyQueue::driver() (%d) team_task(0x%lx)\n"
            , blockIdx.x
            , (unsigned long) team_task );
    }
#endif

    // team_task == q_denied if all queues are empty
    // team_task == 0 if no team tasks available

    if ( q_denied != team_task ) {
      if ( 0 != team_task ) {
        // The whole block executes the team task.
        Kokkos::Impl::CudaTeamMember
          member( kokkos_impl_cuda_shared_memory<void>()
                , 16 /* shared_begin */
                , team_task->m_shmem_size /* shared size */
                , 0 /* scratch level 1 pointer */
                , 0 /* scratch level 1 size */
                , 0 /* league rank */
                , 1 /* league size */
                );

        (*team_task->m_team)( team_task , member );

        // A __synthreads was called and if completed the
        // functor was destroyed.
        if ( IS_TEAM_LEAD ) {
          complete_executed_task( team_task );
        }
      }
      else {
        // One thread of one warp performs this serial task
        if ( threadIdx.x == 0 &&
             0 == ( threadIdx.y % 32 ) ) {

          task_root_type * task = 0 ;
          // Same priority-order polling as the team queues above.
          for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) {
            if ( ( i < 2 /* regular queue */ )
              || ( ! m_space.is_empty() /* waiting for memory */ ) ) {
              task = pop_ready_task( & m_serial[i] );
            }
          }

#ifdef DETAILED_PRINT
          if ( 0 != task ) {
            printf( "CudaTaskPolicyQueue::driver() (%2d)(%d) single task(0x%lx)\n"
                  , blockIdx.x
                  , threadIdx.y
                  , (unsigned long) task );
          }
#endif

          if ( task ) {
            (*task->m_serial)( task );
            complete_executed_task( task );
          }
        }
        __syncthreads();
      }
    }
  } while ( q_denied != team_task );

#ifdef DETAILED_PRINT
  if ( IS_TEAM_LEAD ) {
    printf( "CudaTaskPolicyQueue::driver() end on %d with count %d\n"
          , blockIdx.x , m_count_ready );
  }
#endif

  #undef IS_TEAM_LEAD
}
//-----------------------------------------------------------------------
// Attempt to pop the head task from a ready queue.
// The queue head pointer doubles as a spin lock: a thread claims the
// queue by CAS-ing the head to the QLOCK sentinel, transitions the
// claimed task WAITING -> EXECUTING, advances the head to
// task->m_next, and releases the lock with a second CAS.
// Returns 0 when the queue is empty or another thread holds the lock.
__device__
CudaTaskPolicyQueue::task_root_type *
CudaTaskPolicyQueue::pop_ready_task(
  CudaTaskPolicyQueue::task_root_type * volatile * const queue )
{
  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
  task_root_type * task = 0 ;
  task_root_type * const task_claim = *queue ;

  if ( ( q_lock != task_claim ) && ( 0 != task_claim ) ) {

    // Queue is not locked and not null, try to claim head of queue.
    // Is a race among threads to claim the queue.
    if ( task_claim == atomic_compare_exchange(queue,task_claim,q_lock) ) {

      // Aquired the task which must be in the waiting state.
      const int claim_state =
        atomic_compare_exchange( & task_claim->m_state
                               , int(TASK_STATE_WAITING)
                               , int(TASK_STATE_EXECUTING) );

      task_root_type * lock_verify = 0 ;

      if ( claim_state == int(TASK_STATE_WAITING) ) {
        // Transitioned this task from waiting to executing
        // Update the queue to the next entry and release the lock
        task_root_type * const next =
          *((task_root_type * volatile *) & task_claim->m_next );
        *((task_root_type * volatile *) & task_claim->m_next ) = 0 ;
        lock_verify = atomic_compare_exchange( queue , q_lock , next );
      }

      // Either transition or unlock failing is a fatal invariant
      // violation (note: bitwise | evaluates both operands).
      if ( ( claim_state != int(TASK_STATE_WAITING) ) |
           ( q_lock != lock_verify ) ) {
        printf( "CudaTaskPolicyQueue::pop_ready_task(0x%lx) task(0x%lx) state(%d) ERROR %s\n"
              , (unsigned long) queue
              , (unsigned long) task
              , claim_state
              , ( claim_state != int(TASK_STATE_WAITING)
                ? "NOT WAITING"
                : "UNLOCK" ) );
        Kokkos::abort("CudaTaskPolicyQueue::pop_ready_task");
      }

      task = task_claim ;
    }
  }
  return task ;
}
//-----------------------------------------------------------------------
// Finish a task that has just executed.  If the task respawned
// (its state was reset to WAITING) it is rescheduled; otherwise it is
// completed: its wait queue is closed with the QDENIED sentinel, the
// spawn reference is dropped (which may delete the task), and every
// task that was waiting on it is scheduled.  m_count_ready is
// decremented last.
__device__
void CudaTaskPolicyQueue::complete_executed_task(
  CudaTaskPolicyQueue::task_root_type * task )
{
  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

#ifdef DETAILED_PRINT
  printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) state(%d) (%d)(%d,%d)\n"
        , (unsigned long) task
        , task->m_state
        , blockIdx.x
        , threadIdx.x
        , threadIdx.y
        );
#endif

  // State is either executing or if respawned then waiting,
  // try to transition from executing to complete.
  // Reads the current value.
  const int state_old =
    atomic_compare_exchange( & task->m_state
                           , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                           , int(Kokkos::Experimental::TASK_STATE_COMPLETE) );

  if ( int(Kokkos::Experimental::TASK_STATE_WAITING) == state_old ) {
    /* Task requested a respawn so reschedule it */
    schedule_task( task , false /* not initial spawn */ );
  }
  else if ( int(Kokkos::Experimental::TASK_STATE_EXECUTING) == state_old ) {
    /* Task is complete */

    // Clear dependences of this task before locking wait queue

    task->clear_dependence();

    // Stop other tasks from adding themselves to this task's wait queue.
    // The wait queue is updated concurrently so guard with an atomic.
    // CAS loop swings m_wait to the q_denied sentinel and captures the
    // list of waiters that existed at that moment.

    task_root_type * wait_queue = *((task_root_type * volatile *) & task->m_wait );
    task_root_type * wait_queue_old = 0 ;

    do {
      wait_queue_old = wait_queue ;
      wait_queue = atomic_compare_exchange( & task->m_wait , wait_queue_old , q_denied );
    } while ( wait_queue_old != wait_queue );

    // The task has been removed from ready queue and
    // execution is complete so decrement the reference count.
    // The reference count was incremented by the initial spawning.
    // The task may be deleted if this was the last reference.
    task_root_type::assign( & task , 0 );

    // Pop waiting tasks and schedule them
    while ( wait_queue ) {
      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
      schedule_task( x , false /* not initial spawn */ );
    }
  }
  else {
    // Any other prior state is an invariant violation.
    printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) ERROR state_old(%d) dep_size(%d)\n"
          , (unsigned long)( task )
          , int(state_old)
          , task->m_dep_size
          );
    Kokkos::abort("CudaTaskPolicyQueue::complete_executed_task" );
  }

  // If the task was respawned it may have already been
  // put in a ready queue and the count incremented.
  // By decrementing the count last it will never go to zero
  // with a ready or executing task.
  atomic_fetch_add( & m_count_ready , -1 );
}
// Decrement a latch-task's outstanding count by k (k must be > 0;
// m_dep_size is reused as the latch counter).  The thread that drives
// the count to exactly zero transitions the latch WAITING -> COMPLETE,
// closes its wait queue with the QDENIED sentinel, and schedules every
// waiting task.
__device__
void TaskMember< Kokkos::Cuda , void , void >::latch_add( const int k )
{
  typedef TaskMember< Kokkos::Cuda , void , void > task_root_type ;

  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

  const bool ok_input = 0 < k ;

  // Fetch-and-subtract yields the post-decrement count; a
  // non-positive k skips the decrement and is reported below.
  const int count = ok_input ? atomic_fetch_add( & m_dep_size , -k ) - k
                             : k ;

  const bool ok_count = 0 <= count ;

  // Only attempt the WAITING -> COMPLETE transition when this call
  // drove the count to zero.
  const int state = 0 != count ? TASK_STATE_WAITING :
    atomic_compare_exchange( & m_state
                           , TASK_STATE_WAITING
                           , TASK_STATE_COMPLETE );

  const bool ok_state = state == TASK_STATE_WAITING ;

  if ( ! ok_count || ! ok_state ) {
    printf( "CudaTaskPolicyQueue::latch_add[0x%lx](%d) ERROR %s %d\n"
          , (unsigned long) this
          , k
          , ( ! ok_input ? "Non-positive input" :
            ( ! ok_count ? "Negative count" : "Bad State" ) )
          , ( ! ok_input ? k :
            ( ! ok_count ? count : state ) )
          );
    Kokkos::abort( "CudaTaskPolicyQueue::latch_add ERROR" );
  }
  else if ( 0 == count ) {
    // Stop other tasks from adding themselves to this latch's wait queue.
    // The wait queue is updated concurrently so guard with an atomic.

    CudaTaskPolicyQueue & policy = *m_policy ;

    task_root_type * wait_queue = *((task_root_type * volatile *) &m_wait);
    task_root_type * wait_queue_old = 0 ;

    do {
      wait_queue_old = wait_queue ;
      wait_queue = atomic_compare_exchange( & m_wait , wait_queue_old , q_denied );
    } while ( wait_queue_old != wait_queue );

    // Pop waiting tasks and schedule them
    while ( wait_queue ) {
      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
      policy.schedule_task( x , false /* not initial spawn */ );
    }
  }
}
//----------------------------------------------------------------------------
void CudaTaskPolicyQueue::reschedule_task(
  CudaTaskPolicyQueue::task_root_type * const task )
{
  // A respawn moves an executing task back to the waiting state;
  // any other prior state is a fatal invariant violation.
  const int expected = int(TASK_STATE_EXECUTING);

  const int prior =
    atomic_compare_exchange( & task->m_state
                           , expected
                           , int(TASK_STATE_WAITING) );

  if ( expected == prior ) return ;

  printf( "CudaTaskPolicyQueue::reschedule_task(0x%lx) ERROR state(%d)\n"
        , (unsigned long) task
        , prior
        );
  Kokkos::abort("CudaTaskPolicyQueue::reschedule" );
}
// Make a task eligible to run (host or device callable).
// 1) Transition CONSTRUCTING -> WAITING and validate invariants.
// 2) On initial spawn, take the reference that persists until the
//    task completes.
// 3) Try to push the task onto the wait queue of one incomplete
//    dependence; if every dependence's wait queue is closed (all are
//    complete), push the task onto its ready queue instead.
KOKKOS_FUNCTION
void CudaTaskPolicyQueue::schedule_task(
  CudaTaskPolicyQueue::task_root_type * const task ,
  const bool initial_spawn )
{
  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);

  //----------------------------------------
  // State is either constructing or already waiting.
  // If constructing then transition to waiting.

  {
    const int old_state = atomic_compare_exchange( & task->m_state
                                                 , int(TASK_STATE_CONSTRUCTING)
                                                 , int(TASK_STATE_WAITING) );

    // Head of linked list of tasks waiting on this task
    task_root_type * const waitTask =
      *((task_root_type * volatile const *) & task->m_wait );

    // Member of linked list of tasks waiting on some other task
    task_root_type * const next =
      *((task_root_type * volatile const *) & task->m_next );

    // An incomplete and non-executing task has:
    //   task->m_state == TASK_STATE_CONSTRUCTING or TASK_STATE_WAITING
    //   task->m_wait  != q_denied
    //   task->m_next  == 0
    //
    if ( ( q_denied == waitTask ) ||
         ( 0 != next ) ||
         ( old_state != int(TASK_STATE_CONSTRUCTING) &&
           old_state != int(TASK_STATE_WAITING) ) ) {
      printf( "CudaTaskPolicyQueue::schedule_task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
            , (unsigned long) task
            , old_state
            , (unsigned long) waitTask
            , (unsigned long) next );
      Kokkos::abort("CudaTaskPolicyQueue::schedule" );
    }
  }

  //----------------------------------------

  if ( initial_spawn ) {
    // The initial spawn of a task increments the reference count
    // for the task's existence in either a waiting or ready queue
    // until the task has completed.
    // Completing the task's execution is the matching
    // decrement of the reference count.
    task_root_type::assign( 0 , task );
  }

  //----------------------------------------
  // Insert this task into a dependence task that is not complete.
  // Push on to that task's wait queue.

  bool attempt_insert_in_queue = true ;

  task_root_type * volatile * queue =
    task->m_dep_size ? & task->m_dep[0]->m_wait : (task_root_type **) 0 ;

  for ( int i = 0 ; attempt_insert_in_queue && ( 0 != queue ) ; ) {

    // A dependence's wait queue head equal to q_denied means that
    // dependence already completed; move on to the next one.
    task_root_type * const head_value_old = *queue ;

    if ( q_denied == head_value_old ) {
      // Wait queue is closed because task is complete,
      // try again with the next dependence wait queue.
      ++i ;
      queue = i < task->m_dep_size ? & task->m_dep[i]->m_wait
                                   : (task_root_type **) 0 ;
    }
    else {

      // Wait queue is open and not denied.
      // Have exclusive access to this task.
      // Assign m_next assuming a successfull insertion into the queue.
      // Fence the memory assignment before attempting the CAS.

      *((task_root_type * volatile *) & task->m_next ) = head_value_old ;

      memory_fence();

      // Attempt to insert this task into the queue.
      // If fails then continue the attempt.

      attempt_insert_in_queue =
        head_value_old != atomic_compare_exchange(queue,head_value_old,task);
    }
  }

  //----------------------------------------
  // All dependences are complete, insert into the ready list

  if ( attempt_insert_in_queue ) {

    // Increment the count of ready tasks.
    // Count will be decremented when task is complete.

    atomic_fetch_add( & m_count_ready , 1 );

    queue = task->m_queue ;

    while ( attempt_insert_in_queue ) {

      // A locked queue is being popped.

      task_root_type * const head_value_old = *queue ;

      if ( q_lock != head_value_old ) {
        // Read the head of ready queue,
        // if same as previous value then CAS locks the ready queue
        // Have exclusive access to this task,
        // assign to head of queue, assuming successful insert
        // Fence assignment before attempting insert.
        *((task_root_type * volatile *) & task->m_next ) = head_value_old ;

        memory_fence();

        attempt_insert_in_queue =
          head_value_old != atomic_compare_exchange(queue,head_value_old,task);
      }
    }
  }
}
// Return a task's storage (root object plus trailing dependence
// array, m_size_alloc bytes total) to the UVM memory pool.
void CudaTaskPolicyQueue::deallocate_task
( CudaTaskPolicyQueue::task_root_type * const task )
{
  m_space.deallocate( task , task->m_size_alloc );
}
KOKKOS_FUNCTION
CudaTaskPolicyQueue::task_root_type *
CudaTaskPolicyQueue::allocate_task
  ( const unsigned arg_sizeof_task
  , const unsigned arg_dep_capacity
  , const unsigned arg_team_shmem
  )
{
  // Round the task size up to pointer alignment so the trailing
  // dependence-pointer array is properly aligned.
  const unsigned ptr_size  = sizeof(task_root_type*);
  const unsigned remainder = arg_sizeof_task % ptr_size ;
  const unsigned base_size =
    arg_sizeof_task + ( remainder ? ptr_size - remainder : 0 );

  // ~0u requests the queue's default dependence capacity.
  const unsigned dep_capacity =
    ( ~0u == arg_dep_capacity ) ? m_default_dependence_capacity
                                : arg_dep_capacity ;

  const unsigned size_alloc = base_size + ptr_size * dep_capacity ;

  task_root_type * const task =
    reinterpret_cast<task_root_type*>( m_space.allocate( size_alloc ) );

  // Allocation may fail (pool exhausted); caller receives 0.
  if ( 0 != task ) {
    // Placement-construct the root object; the caller is responsible
    // for copy-constructing the functor into the derived storage.
    new( (void*) task ) task_root_type();

    task->m_policy       = this ;
    task->m_size_alloc   = size_alloc ;
    task->m_dep_capacity = dep_capacity ;
    task->m_shmem_size   = arg_team_shmem ;

    if ( dep_capacity ) {
      // Dependence array lives immediately after the aligned task.
      task->m_dep =
        reinterpret_cast<task_root_type**>(
          reinterpret_cast<unsigned char*>(task) + base_size );

      for ( unsigned i = 0 ; i < dep_capacity ; ++i ) {
        task->task_root_type::m_dep[i] = 0 ;
      }
    }
  }

  return task ;
}
//----------------------------------------------------------------------------
void CudaTaskPolicyQueue::add_dependence
( CudaTaskPolicyQueue::task_root_type * const after
, CudaTaskPolicyQueue::task_root_type * const before
)
{
  // Null endpoints are silently ignored.
  if ( 0 == after || 0 == before ) return ;

  int const state = *((volatile const int *) & after->m_state );

  // Only add dependence during construction or during execution.
  // Both tasks must have the same policy.
  // Dependence on non-full memory cannot be mixed with any other dependence.
  const bool ok_state =
    Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state ||
    Kokkos::Experimental::TASK_STATE_EXECUTING    == state ;

  const bool ok_capacity =
    after->m_dep_size < after->m_dep_capacity ;

  const bool ok_policy =
    after->m_policy == this && before->m_policy == this ;

  if ( ok_state && ok_capacity && ok_policy ) {
    // Claim the next dependence slot, then reference-count 'before'
    // into it and publish with a fence.
    ++after->m_dep_size ;
    task_root_type::assign( after->m_dep + (after->m_dep_size-1) , before );
    memory_fence();
    return ;
  }

  printf( "CudaTaskPolicyQueue::add_dependence( 0x%lx , 0x%lx ) ERROR %s\n"
        , (unsigned long) after
        , (unsigned long) before
        , ( ! ok_state ? "Task not constructing or executing" :
          ( ! ok_capacity ? "Task Exceeded dependence capacity"
                          : "Tasks from different policies" )) );
  Kokkos::abort("CudaTaskPolicyQueue::add_dependence ERROR");
}
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Cuda >::TaskPolicy
  ( const unsigned arg_task_max_count
  , const unsigned arg_task_max_size
  , const unsigned arg_task_default_dependence_capacity
  , const unsigned arg_task_team_size
  )
  : m_track()
  , m_policy(0)
{
  typedef Kokkos::Experimental::Impl::SharedAllocationRecord
    < Kokkos::CudaUVMSpace , Impl::CudaTaskPolicyQueue::Destroy > record_type ;

  // Each task allocation must hold the application's functor plus the
  // task root object and its dependence-pointer array.
  const size_t full_task_size_estimate =
    arg_task_max_size
    + sizeof(task_root_type)
    + sizeof(task_root_type*) * arg_task_default_dependence_capacity ;

  // The queue data structure itself lives in UVM space so tasks can
  // be created from both host and device.
  record_type * const queue_record =
    record_type::allocate( Kokkos::CudaUVMSpace()
                         , "CudaUVM task queue"
                         , sizeof(Impl::CudaTaskPolicyQueue)
                         );

  m_policy =
    reinterpret_cast< Impl::CudaTaskPolicyQueue * >( queue_record->data() );

  // Placement-construct the queue in the freshly allocated record.
  new( m_policy )
    Impl::CudaTaskPolicyQueue( arg_task_max_count
                             , full_task_size_estimate
                             , arg_task_default_dependence_capacity
                             , arg_task_team_size );

  // Wire the record's destroy functor to the queue so destruction
  // verifies emptiness before releasing the storage.
  queue_record->m_destroy.m_policy = m_policy ;

  m_track.assign_allocated_record_to_uninitialized( queue_record );
}
// Kernel entry point: every launched thread block runs the queue's
// device-side driver loop (see CudaTaskPolicyQueue::driver).
__global__
static void kokkos_cuda_task_policy_queue_driver
  ( Kokkos::Experimental::Impl::CudaTaskPolicyQueue * queue )
{
  queue->driver();
}
// Host-side blocking wait: launches the driver kernel with one thread
// block per multiprocessor and (1 x m_team_size) threads per block,
// then synchronizes.  The kernel returns only when the queue's count
// of ready-plus-executing tasks reaches zero (see driver()).
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Cuda > & policy )
{
  const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
  const dim3 block( 1 , policy.m_policy->m_team_size , 1 );
  const int shared = 0 ; // Kokkos::Impl::CudaTraits::SharedMemoryUsage / 2 ;
  const cudaStream_t stream = 0 ;

#ifdef DETAILED_PRINT
  printf("kokkos_cuda_task_policy_queue_driver grid(%d,%d,%d) block(%d,%d,%d) shared(%d) policy(0x%lx)\n"
        , grid.x , grid.y , grid.z
        , block.x , block.y , block.z
        , shared
        , (unsigned long)( policy.m_policy ) );
  fflush(stdout);
#endif

  // Drain previously launched work before starting the drivers.
  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

/*
  CUDA_SAFE_CALL(
    cudaFuncSetCacheConfig( kokkos_cuda_task_policy_queue_driver
                          , cudaFuncCachePreferL1 ) );

  CUDA_SAFE_CALL( cudaGetLastError() );
*/

  kokkos_cuda_task_policy_queue_driver<<< grid , block , shared , stream >>>
    ( policy.m_policy );

  CUDA_SAFE_CALL( cudaGetLastError() );
  CUDA_SAFE_CALL( cudaDeviceSynchronize() );

#ifdef DETAILED_PRINT
  printf("kokkos_cuda_task_policy_queue_driver end\n");
  fflush(stdout);
#endif
}
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// Shorthand for the root task type in this translation unit.
typedef TaskMember< Kokkos::Cuda , void , void > Task ;

// Out-of-line, empty destructor.  The functor is destroyed explicitly
// inside apply_single/apply_team and the storage is returned to the
// pool by deallocate_task, so nothing remains to do here.
__host__ __device__
Task::~TaskMember()
{
}
// Reference-counted pointer assignment: *lhs_ptr = rhs.
// Increments rhs's reference count, decrements the former target's,
// and deallocates the former target when its count reaches zero.
// Pass lhs_ptr == 0 to only add a reference to rhs; pass rhs == 0 to
// only release *lhs_ptr.  Caller must have exclusive access to
// *lhs_ptr (it is not updated atomically -- see comment below).
__host__ __device__
void Task::assign( Task ** const lhs_ptr , Task * rhs )
{
  Task * const q_denied = reinterpret_cast<Task*>(QDENIED);

  // Increment rhs reference count.
  if ( rhs ) { atomic_fetch_add( & rhs->m_ref_count , 1 ); }

  if ( 0 == lhs_ptr ) return ;

  // Must have exclusive access to *lhs_ptr.
  // Assign the pointer and retrieve the previous value.

  // Cannot use atomic exchange since *lhs_ptr may be
  // in Cuda register space.

#if 0

  Task * const old_lhs = *((Task*volatile*)lhs_ptr);
  *((Task*volatile*)lhs_ptr) = rhs ;
  Kokkos::memory_fence();

#else

  Task * const old_lhs = *lhs_ptr ;
  *lhs_ptr = rhs ;

#endif

  // Tasks managed by different queues must never reference each other.
  if ( old_lhs && rhs && old_lhs->m_policy != rhs->m_policy ) {
    Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR different queues");
  }

  if ( old_lhs ) {

    Kokkos::memory_fence();

    // Decrement former lhs reference count.
    // If reference count is zero task must be complete, then delete task.
    // Task is ready for deletion when wait == q_denied

    int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
    int const state = old_lhs->m_state ;
    Task * const wait = *((Task * const volatile *) & old_lhs->m_wait );

    const bool ok_count = 0 <= count ;

    // If count == 0 then will be deleting
    // and must either be constructing or complete.

    const bool ok_state = 0 < count ? true :
      ( ( state == int(TASK_STATE_CONSTRUCTING) && wait == 0 ) ||
        ( state == int(TASK_STATE_COMPLETE) && wait == q_denied ) )
      &&
      old_lhs->m_next == 0 &&
      old_lhs->m_dep_size == 0 ;

    if ( ! ok_count || ! ok_state ) {
      printf( "%s Kokkos::Impl::TaskManager<Kokkos::Cuda>::assign ERROR deleting task(0x%lx) m_ref_count(%d) m_state(%d) m_wait(0x%ld)\n"
#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_CUDA )
            , "CUDA "
#else
            , "HOST "
#endif
            , (unsigned long) old_lhs
            , count
            , state
            , (unsigned long) wait );
      Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR deleting");
    }

    if ( count == 0 ) {
      // When 'count == 0' this thread has exclusive access to 'old_lhs'

#ifdef DETAILED_PRINT
      printf( "Task::assign(...) old_lhs(0x%lx) deallocate\n"
            , (unsigned long) old_lhs
            );
#endif

      old_lhs->m_policy->deallocate_task( old_lhs );
    }
  }
}
//----------------------------------------------------------------------------
// Returns the number of dependences currently set on this task.
__device__
int Task::get_dependence() const
{
  return m_dep_size ;
}
// Returns dependence i.  Only valid while this task is executing;
// aborts on wrong state, out-of-range index, or a null entry.
__device__
Task * Task::get_dependence( int i ) const
{
  Task * const t = ((Task*volatile*)m_dep)[i] ;

  if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) {
    printf( "TaskMember< Cuda >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
          , (unsigned long) this
          , m_state
          , m_dep_size
          , i
          , (unsigned long) t
          );
    Kokkos::abort("TaskMember< Cuda >::get_dependence ERROR");
  }

  return t ;
}
//----------------------------------------------------------------------------
__device__ __host__
void Task::clear_dependence()
{
  // Release every dependence slot in descending order (each assign
  // decrements that dependence's reference count), then publish the
  // zeroed count.
  for ( int i = m_dep_size ; 0 < i ; ) {
    --i ;
    assign( m_dep + i , 0 );
  }

  *((volatile int *) & m_dep_size ) = 0 ;

  memory_fence();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -1,833 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_CUDA_TASKPOLICY_HPP
#define KOKKOS_CUDA_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct CudaTaskPolicyQueue ;
/** \brief Base class for all Kokkos::Cuda tasks */
template<>
class TaskMember< Kokkos::Cuda , void , void > {
public:

  template< class > friend class Kokkos::Experimental::TaskPolicy ;
  friend struct CudaTaskPolicyQueue ;

  // Type-erased apply functions, executed by one thread (serial) or
  // by a whole thread block (team).
  typedef void (* function_single_type) ( TaskMember * );
  typedef void (* function_team_type)   ( TaskMember * , Kokkos::Impl::CudaTeamMember & );

private:

  CudaTaskPolicyQueue   * m_policy ;       ///< Owning queue
  TaskMember * volatile * m_queue ;        ///< Ready queue this task is pushed to
  function_team_type      m_team ;         ///< Apply function on CUDA
  function_single_type    m_serial ;       ///< Apply function on CUDA
  TaskMember **           m_dep ;          ///< Dependences
  TaskMember *            m_wait ;         ///< Linked list of tasks waiting on this task
  TaskMember *            m_next ;         ///< Linked list of tasks waiting on a different task
  int                     m_dep_capacity ; ///< Capacity of dependences
  int                     m_dep_size ;     ///< Actual count of dependences
  int                     m_size_alloc ;   ///< Total bytes allocated for this task
  int                     m_shmem_size ;   ///< Team-task shared memory size
  int                     m_ref_count ;    ///< Reference count
  int                     m_state ;        ///< State of the task

  TaskMember( TaskMember && ) = delete ;
  TaskMember( const TaskMember & ) = delete ;
  TaskMember & operator = ( TaskMember && ) = delete ;
  TaskMember & operator = ( const TaskMember & ) = delete ;

protected:

  // Fix: initializers are listed in member declaration order
  // (m_dep_capacity, m_dep_size, m_size_alloc); members are always
  // initialized in declaration order regardless of list order, so the
  // previous out-of-order list was misleading and triggered -Wreorder.
  KOKKOS_INLINE_FUNCTION
  TaskMember()
    : m_policy(0)
    , m_queue(0)
    , m_team(0)
    , m_serial(0)
    , m_dep(0)
    , m_wait(0)
    , m_next(0)
    , m_dep_capacity(0)
    , m_dep_size(0)
    , m_size_alloc(0)
    , m_shmem_size(0)
    , m_ref_count(0)
    , m_state( TASK_STATE_CONSTRUCTING )
    {}

public:

  KOKKOS_FUNCTION
  ~TaskMember();

  KOKKOS_INLINE_FUNCTION
  int reference_count() const
    { return *((volatile int *) & m_ref_count ); }

  // Cannot use the function pointer to verify the type
  // since the function pointer is not unique between
  // Host and Cuda. Don't run verificaton for Cuda.
  // Assume testing on Host-only back-end will catch such errors.

  template< typename ResultType >
  KOKKOS_INLINE_FUNCTION static
  TaskMember * verify_type( TaskMember * t ) { return t ; }

  //----------------------------------------
  /*  Inheritence Requirements on task types:
   *
   *    class DerivedTaskType
   *      : public TaskMember< Cuda , DerivedType::value_type , FunctorType >
   *      { ... };
   *
   *    class TaskMember< Cuda , DerivedType::value_type , FunctorType >
   *      : public TaskMember< Cuda , DerivedType::value_type , void >
   *      , public Functor
   *      { ... };
   *
   *  If value_type != void
   *    class TaskMember< Cuda , value_type , void >
   *      : public TaskMember< Cuda , void , void >
   *
   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
   *
   */
  //----------------------------------------
  // If after the 'apply' the task's state is waiting
  // then it will be rescheduled and called again.
  // Otherwise the functor must be destroyed.

  template< class DerivedTaskType , class Tag >
  __device__ static
  void apply_single(
    typename std::enable_if
      <( std::is_same< Tag , void >::value &&
         std::is_same< typename DerivedTaskType::result_type , void >::value
       ), TaskMember * >::type t )
    {
      typedef typename DerivedTaskType::functor_type functor_type ;

      functor_type * const f =
        static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );

      f->apply();

      if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
        f->~functor_type();
      }
    }

  template< class DerivedTaskType , class Tag >
  __device__ static
  void apply_single(
    typename std::enable_if
      <( std::is_same< Tag , void >::value &&
         ! std::is_same< typename DerivedTaskType::result_type , void >::value
       ), TaskMember * >::type t )
    {
      typedef typename DerivedTaskType::functor_type functor_type ;

      DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
      functor_type    * const f    = static_cast< functor_type * >( self );

      f->apply( self->m_result );

      if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
        f->~functor_type();
      }
    }

  // Must run on the device: the device-side function pointer is
  // captured there (see cuda_set_apply_single kernel).
  template< class DerivedTaskType , class Tag >
  __device__
  void set_apply_single()
    {
      m_serial = & TaskMember::template apply_single<DerivedTaskType,Tag> ;
    }

  //----------------------------------------

  template< class DerivedTaskType , class Tag >
  __device__ static
  void apply_team(
    typename std::enable_if
      <( std::is_same<Tag,void>::value &&
         std::is_same<typename DerivedTaskType::result_type,void>::value
       ), TaskMember * >::type t
    , Kokkos::Impl::CudaTeamMember & member
    )
    {
      typedef typename DerivedTaskType::functor_type functor_type ;

      functor_type * const f =
        static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );

      f->apply( member );

      __syncthreads(); // Wait for team to finish calling function

      if ( threadIdx.x == 0 &&
           threadIdx.y == 0 &&
           t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
        f->~functor_type();
      }
    }

  template< class DerivedTaskType , class Tag >
  __device__ static
  void apply_team(
    typename std::enable_if
      <( std::is_same<Tag,void>::value &&
         ! std::is_same<typename DerivedTaskType::result_type,void>::value
       ), TaskMember * >::type t
    , Kokkos::Impl::CudaTeamMember & member
    )
    {
      typedef typename DerivedTaskType::functor_type functor_type ;

      DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
      functor_type    * const f    = static_cast< functor_type * >( self );

      f->apply( member , self->m_result );

      __syncthreads(); // Wait for team to finish calling function

      if ( threadIdx.x == 0 &&
           threadIdx.y == 0 &&
           t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
        f->~functor_type();
      }
    }

  template< class DerivedTaskType , class Tag >
  __device__
  void set_apply_team()
    {
      m_team = & TaskMember::template apply_team<DerivedTaskType,Tag> ;
    }

  //----------------------------------------

  KOKKOS_FUNCTION static
  void assign( TaskMember ** const lhs , TaskMember * const rhs );

  __device__
  TaskMember * get_dependence( int i ) const ;

  __device__
  int get_dependence() const ;

  KOKKOS_FUNCTION void clear_dependence();

  __device__
  void latch_add( const int k );

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION static
  void construct_result( TaskMember * const ) {}

  typedef FutureValueTypeIsVoidError get_result_type ;

  KOKKOS_INLINE_FUNCTION
  get_result_type get() const { return get_result_type() ; }

  KOKKOS_INLINE_FUNCTION
  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }

};
/** \brief A Future< Kokkos::Cuda , ResultType > will cast
* from TaskMember< Kokkos::Cuda , void , void >
* to TaskMember< Kokkos::Cuda , ResultType , void >
* to query the result.
*/
template< class ResultType >
class TaskMember< Kokkos::Cuda , ResultType , void >
  : public TaskMember< Kokkos::Cuda , void , void >
{
public:

  typedef ResultType result_type ;

  // Storage for the task's result value; read through get().
  result_type m_result ;

  typedef const result_type & get_result_type ;

  KOKKOS_INLINE_FUNCTION
  get_result_type get() const { return m_result ; }

  // Default-construct the result member in the raw task storage
  // (called from the derived type's copy_construct).
  KOKKOS_INLINE_FUNCTION static
  void construct_result( TaskMember * const ptr )
    {
      new((void*)(& ptr->m_result)) result_type();
    }

  TaskMember() = delete ;
  TaskMember( TaskMember && ) = delete ;
  TaskMember( const TaskMember & ) = delete ;
  TaskMember & operator = ( TaskMember && ) = delete ;
  TaskMember & operator = ( const TaskMember & ) = delete ;
};
/** \brief Callback functions will cast
* from TaskMember< Kokkos::Cuda , void , void >
* to TaskMember< Kokkos::Cuda , ResultType , FunctorType >
* to execute work functions.
*/
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Cuda , ResultType , FunctorType >
  : public TaskMember< Kokkos::Cuda , ResultType , void >
  , public FunctorType
{
public:

  typedef ResultType  result_type ;
  typedef FunctorType functor_type ;

  // Copy-construct the application's functor into the raw task
  // storage, then default-construct the result member.
  KOKKOS_INLINE_FUNCTION static
  void copy_construct( TaskMember * const ptr
                     , const functor_type & arg_functor )
    {
      typedef TaskMember< Kokkos::Cuda , ResultType , void > base_type ;

      new((void*)static_cast<FunctorType*>(ptr)) functor_type( arg_functor );

      base_type::construct_result( static_cast<base_type*>( ptr ) );
    }

  TaskMember() = delete ;
  TaskMember( TaskMember && ) = delete ;
  TaskMember( const TaskMember & ) = delete ;
  TaskMember & operator = ( TaskMember && ) = delete ;
  TaskMember & operator = ( const TaskMember & ) = delete ;
};
//----------------------------------------------------------------------------
// NOTE(review): anonymous namespace in a header gives these kernels
// internal linkage in every including translation unit -- presumably
// intentional here so each TU instantiates its own copy; confirm
// before changing.
namespace {

// Kernels that set a task's apply function pointer on the device.
// Per the TaskMember class comment, function pointers are not unique
// between Host and Cuda, so the device-side address must be captured
// by running on the device.
template< class DerivedTaskType , class Tag >
__global__
void cuda_set_apply_single( DerivedTaskType * task )
{
  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
    task_root_type ;

  task->task_root_type::template set_apply_single< DerivedTaskType , Tag >();
}

template< class DerivedTaskType , class Tag >
__global__
void cuda_set_apply_team( DerivedTaskType * task )
{
  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
    task_root_type ;

  task->task_root_type::template set_apply_team< DerivedTaskType , Tag >();
}

} /* namespace */
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// Shared task-scheduling state for TaskPolicy<Cuda>.  Lives in CUDA UVM
// memory so both host and device code can create and schedule tasks.
struct CudaTaskPolicyQueue {
// Three ready-queue priorities per mode; index 2 is used by
// respawn_needing_memory (see TaskPolicy below).
enum { NPRIORITY = 3 };
// Must use UVM so that tasks can be created in both
// Host and Cuda space.
typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace >
memory_space ;
typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
task_root_type ;
// Pool from which all task objects are allocated.
memory_space m_space ;
// Ready queues: one set for team tasks, one for serial tasks.
task_root_type * m_team[ NPRIORITY ] ;
task_root_type * m_serial[ NPRIORITY ];
int m_team_size ;
int m_default_dependence_capacity ;
int volatile m_count_ready ; ///< Ready plus executing tasks
// Execute tasks until all non-waiting tasks are complete
__device__
void driver();
// Pop the next task from a ready queue; device-only.
__device__ static
task_root_type * pop_ready_task( task_root_type * volatile * const queue );
// When a task finishes executing.
__device__
void complete_executed_task( task_root_type * );
// Place a task into its ready queue; callable from host or device.
KOKKOS_FUNCTION void schedule_task( task_root_type * const
, const bool initial_spawn = true );
KOKKOS_FUNCTION void reschedule_task( task_root_type * const );
// Record that 'after' may not run until 'before' completes.
KOKKOS_FUNCTION
void add_dependence( task_root_type * const after
, task_root_type * const before );
// The queue is uniquely owned shared state; no copy/move/default.
CudaTaskPolicyQueue() = delete ;
CudaTaskPolicyQueue( CudaTaskPolicyQueue && ) = delete ;
CudaTaskPolicyQueue( const CudaTaskPolicyQueue & ) = delete ;
CudaTaskPolicyQueue & operator = ( CudaTaskPolicyQueue && ) = delete ;
CudaTaskPolicyQueue & operator = ( const CudaTaskPolicyQueue & ) = delete ;
~CudaTaskPolicyQueue();
// Construct only on the Host
CudaTaskPolicyQueue
( const unsigned arg_task_max_count
, const unsigned arg_task_max_size
, const unsigned arg_task_default_dependence_capacity
, const unsigned arg_task_team_size
);
// Deleter functor used by the shared-allocation tracking of the queue.
struct Destroy {
CudaTaskPolicyQueue * m_policy ;
void destroy_shared_allocation();
};
//----------------------------------------
/** \brief Allocate and construct a task.
*
* Allocate space for DerivedTaskType followed
* by TaskMember*[ dependence_capacity ]
*/
KOKKOS_FUNCTION
task_root_type *
allocate_task( const unsigned arg_sizeof_task
, const unsigned arg_dep_capacity
, const unsigned arg_team_shmem = 0 );
KOKKOS_FUNCTION void deallocate_task( task_root_type * const );
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Cuda > & );
// Cuda specialization of the experimental task-DAG policy.  Holds a
// reference-tracked pointer to the shared CudaTaskPolicyQueue.  Note the
// split between proc_* (host-side create, launches a kernel to set the
// device apply pointer) and task_* (device-side create) functions below.
template<>
class TaskPolicy< Kokkos::Cuda >
{
public:
typedef Kokkos::Cuda execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::CudaTeamMember member_type ;
private:
typedef Impl::TaskMember< Kokkos::Cuda , void , void > task_root_type ;
typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace > memory_space ;
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
// Reference count on the shared queue allocation.
track_type m_track ;
// Shared scheduling state; UVM-resident (see CudaTaskPolicyQueue).
Impl::CudaTaskPolicyQueue * m_policy ;
// Recover the root-task pointer from a user functor pointer by casting
// through the derived TaskMember type (the functor is a base class of it).
template< class FunctorType >
KOKKOS_INLINE_FUNCTION static
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION static
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
public:
// Host-only construction of the shared queue (see CudaTaskPolicyQueue).
TaskPolicy
( const unsigned arg_task_max_count
, const unsigned arg_task_max_size
, const unsigned arg_task_default_dependence_capacity = 4
, const unsigned arg_task_team_size = 0 /* choose default */
);
KOKKOS_FUNCTION TaskPolicy() = default ;
KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
// Allocation count is not tracked by this backend; always reports zero.
KOKKOS_FUNCTION
int allocated_task_count() const { return 0 ; }
//----------------------------------------
// Create serial-thread task
// Main process and tasks must use different functions
// to work around CUDA limitation where __host__ __device__
// functions are not allowed to invoke templated __global__ functions.
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
proc_create( const FunctorType & arg_functor
, const unsigned arg_dep_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
task_type ;
// May return null when pool memory is exhausted; callers get a null
// Future (see respawn_needing_memory for the recovery protocol).
task_type * const task =
static_cast<task_type*>(
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) );
if ( task ) {
// The root part of the class has been constructed.
// Must now construct the functor and result specific part.
task_type::copy_construct( task , arg_functor );
// Setting the apply pointer on the device requires code
// executing on the GPU. This function is called on the
// host process so a kernel must be run.
// Launching a kernel will cause the allocated task in
// UVM memory to be copied to the GPU.
// Synchronize to guarantee non-concurrent access
// between host and device.
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
Impl::cuda_set_apply_single<task_type,void><<<1,1>>>( task );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
return Future< value_type , execution_space >( task );
}
// Device-side counterpart of proc_create: no kernel launch needed, the
// apply pointer can be set directly from device code.
template< class FunctorType >
__device__
Future< typename FunctorType::value_type , execution_space >
task_create( const FunctorType & arg_functor
, const unsigned arg_dep_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
task_type ;
task_type * const task =
static_cast<task_type*>(
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) );
if ( task ) {
// The root part of the class has been constructed.
// Must now construct the functor and result specific part.
task_type::copy_construct( task , arg_functor );
// Setting the apply pointer on the device requires code
// executing on the GPU. If this function is called on the
// Host then a kernel must be run.
task->task_root_type::template set_apply_single< task_type , void >();
}
return Future< value_type , execution_space >( task );
}
//----------------------------------------
// Create thread-team task
// Main process and tasks must use different functions
// to work around CUDA limitation where __host__ __device__
// functions are not allowed to invoke templated __global__ functions.
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
proc_create_team( const FunctorType & arg_functor
, const unsigned arg_dep_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
task_type ;
// Team tasks additionally reserve the functor's shared-memory request.
const unsigned team_shmem_size =
Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value
( arg_functor , m_policy->m_team_size );
task_type * const task =
static_cast<task_type*>(
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) );
if ( task ) {
// The root part of the class has been constructed.
// Must now construct the functor and result specific part.
task_type::copy_construct( task , arg_functor );
// Setting the apply pointer on the device requires code
// executing on the GPU. This function is called on the
// host process so a kernel must be run.
// Launching a kernel will cause the allocated task in
// UVM memory to be copied to the GPU.
// Synchronize to guarantee non-concurrent access
// between host and device.
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
Impl::cuda_set_apply_team<task_type,void><<<1,1>>>( task );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
return Future< value_type , execution_space >( task );
}
// Device-side counterpart of proc_create_team.
template< class FunctorType >
__device__
Future< typename FunctorType::value_type , execution_space >
task_create_team( const FunctorType & arg_functor
, const unsigned arg_dep_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType >
task_type ;
const unsigned team_shmem_size =
Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value
( arg_functor , m_policy->m_team_size );
task_type * const task =
static_cast<task_type*>(
m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) );
if ( task ) {
// The root part of the class has been constructed.
// Must now construct the functor and result specific part.
task_type::copy_construct( task , arg_functor );
// Setting the apply pointer on the device requires code
// executing on the GPU. If this function is called on the
// Host then a kernel must be run.
task->task_root_type::template set_apply_team< task_type , void >();
}
return Future< value_type , execution_space >( task );
}
//----------------------------------------
// Create a latch task that becomes complete after N countdown events.
// NOTE(review): unlike the create functions above, the allocation result
// is not checked for null before dereferencing — confirm allocate_task
// cannot fail for this minimal size, or add the check.
Future< Latch , execution_space >
KOKKOS_INLINE_FUNCTION
create_latch( const int N ) const
{
task_root_type * const task =
m_policy->allocate_task( sizeof(task_root_type) , 0 , 0 );
task->m_dep_size = N ; // Using m_dep_size for latch counter
task->m_state = TASK_STATE_WAITING ;
return Future< Latch , execution_space >( task );
}
//----------------------------------------
// Add dependence between two futures of this execution space.
template< class A1 , class A2 , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename std::enable_if
< std::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{ m_policy->add_dependence( after.m_task , before.m_task ); }
// Add dependence from an executing task (identified by its functor) on a future.
template< class FunctorType , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename std::enable_if
< std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{ m_policy->add_dependence( get_task_root(task_functor) , before.m_task ); }
// Submit a created task for execution.  Queue index 0 is the high
// priority queue, index 1 normal priority; team vs serial queue is
// selected by the task's m_team flag.
template< class ValueType >
KOKKOS_INLINE_FUNCTION
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f
, const bool priority = false ) const
{
if ( f.m_task ) {
f.m_task->m_queue =
( f.m_task->m_team != 0
? & ( m_policy->m_team[ priority ? 0 : 1 ] )
: & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
m_policy->schedule_task( f.m_task );
}
return f ;
}
// Re-submit the currently executing task after it returns.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn( FunctorType * task_functor
, const bool priority = false ) const
{
task_root_type * const t = get_task_root(task_functor);
t->m_queue =
( t->m_team != 0 ? & ( m_policy->m_team[ priority ? 0 : 1 ] )
: & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
m_policy->reschedule_task( t );
}
// When a create method fails by returning a null Future
// the task that called the create method may respawn
// with a dependence on memory becoming available.
// This is a race as more than one task may be respawned
// with this need.
// Queue index 2 (see NPRIORITY) is reserved for these memory-starved tasks.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn_needing_memory( FunctorType * task_functor ) const
{
task_root_type * const t = get_task_root(task_functor);
t->m_queue =
( t->m_team != 0 ? & ( m_policy->m_team[ 2 ] )
: & ( m_policy->m_serial[ 2 ] ) );
m_policy->reschedule_task( t );
}
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
get_task_root(task_functor)->get_dependence(i)
);
}
// Number of dependences of the executing task.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
int get_dependence( const FunctorType * task_functor ) const
{ return get_task_root(task_functor)->get_dependence(); }
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void clear_dependence( FunctorType * task_functor ) const
{ get_task_root(task_functor)->clear_dependence(); }
//----------------------------------------
// Fabricate a team-member handle for executing a serial (single) task:
// no shared memory, league of size one.
__device__
static member_type member_single()
{
return
member_type( 0 /* shared memory pointer */
, 0 /* shared memory begin offset */
, 0 /* shared memory end offset */
, 0 /* scratch level_1 pointer */
, 0 /* scratch level_1 size */
, 0 /* league rank */
, 1 /* league size */ );
}
friend void wait( TaskPolicy< Kokkos::Cuda > & );
};
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */

View File

@ -41,53 +41,266 @@
//@HEADER
*/
#ifndef KOKKOS_CUDA_VIEW_HPP
#define KOKKOS_CUDA_VIEW_HPP
#include <Kokkos_Macros.hpp>
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <cstring>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <impl/Kokkos_Shape.hpp>
#include <Kokkos_View.hpp>
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
struct AssertShapeBoundsAbort< CudaSpace >
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t /* rank */ ,
const size_t /* n0 */ , const size_t /* n1 */ ,
const size_t /* n2 */ , const size_t /* n3 */ ,
const size_t /* n4 */ , const size_t /* n5 */ ,
const size_t /* n6 */ , const size_t /* n7 */ ,
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
const size_t /* arg_rank */ ,
const size_t /* i0 */ , const size_t /* i1 */ ,
const size_t /* i2 */ , const size_t /* i3 */ ,
const size_t /* i4 */ , const size_t /* i5 */ ,
const size_t /* i6 */ , const size_t /* i7 */ )
// Data handle that reads elements through a CUDA texture object when
// executing device code on compute capability >= 3.0, and falls back to a
// plain pointer read otherwise.  The texture fetch is performed on the
// 4/8/16-byte AliasType (int/int2/int4) and the bits are reinterpreted as
// ValueType; see the ViewDataHandle specialization below which selects the
// matching AliasType by sizeof(ValueType).
template< typename ValueType , typename AliasType >
struct CudaTextureFetch {
::cudaTextureObject_t m_obj ;   // texture object spanning the whole allocation
const ValueType * m_ptr ;       // raw pointer for the non-texture fallback
int m_offset ;                  // element offset of m_ptr within the allocation
// Deference operator pulls through texture object and returns by value.
// BUG FIX: removed a stray unconditional
//   Kokkos::abort("Kokkos::View array bounds violation");
// at the top of this operator, which made every element access abort
// instead of performing the fetch.
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_ptr[ i ];
#endif
}
// Pointer to referenced memory
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( CudaTextureFetch && rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
// Texture object spans the entire allocation.
// This handle may view a subset of the allocation, so an offset is required.
// Host-only: attaching a texture object requires the allocation record.
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
)
: m_obj( record.template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
// Texture object spans the entire allocation.
// This handle may view a subset of the allocation, so an offset is required.
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs , size_t offset )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr + offset)
, m_offset( offset + rhs.m_offset )
{}
};
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
// Data handle that reads elements with the __ldg read-only-cache intrinsic
// in device code, falling back to a plain pointer read on the host.  Unlike
// CudaTextureFetch it needs no texture object, only the pointer.
template< typename ValueType , typename AliasType >
struct CudaLDGFetch {
const ValueType * m_ptr ;
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#ifdef __CUDA_ARCH__
// Load via __ldg on the AliasType (int/int2/int4) and reinterpret
// the bits as ValueType; sizes match by construction (see alias_type
// selection in the ViewDataHandle specialization below).
AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_ptr[i];
#endif
}
// Pointer to referenced memory.
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaLDGFetch() : m_ptr() {}
KOKKOS_INLINE_FUNCTION
~CudaLDGFetch() {}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( const CudaLDGFetch & rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( CudaLDGFetch && rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
// The allocation record is accepted for interface parity with
// CudaTextureFetch but is not needed here.
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
: m_ptr( arg_ptr )
{}
// Subview construction: shift the pointer by 'offset' elements.
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( CudaLDGFetch const rhs ,size_t offset)
: m_ptr( rhs.m_ptr + offset )
{}
};
#endif
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
// Enabled only for views that are safe and profitable to read through the
// texture / __ldg path; each condition is spelled out inline below.
template< class Traits >
class ViewDataHandle< Traits ,
typename std::enable_if<(
// Is Cuda memory space
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
&&
// Is a trivial const value of 4, 8, or 16 bytes
std::is_trivial<typename Traits::const_value_type>::value
&&
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
&&
( sizeof(typename Traits::const_value_type) == 4 ||
sizeof(typename Traits::const_value_type) == 8 ||
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::RandomAccess != 0 )
)>::type >
{
public:
using track_type  = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using value_type  = typename Traits::const_value_type ;
using return_type = typename Traits::const_value_type ; // NOT a reference
// Pick the texture-fetchable alias type matching sizeof(value_type):
// 4 -> int, 8 -> int2, 16 -> int4.
using alias_type = typename std::conditional< ( sizeof(value_type) ==  4 ) , int ,
typename std::conditional< ( sizeof(value_type) ==  8 ) , ::int2 ,
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
>::type
>::type
>::type ;
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
#else
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
#endif
// Handle-to-handle assignment: the fetch handle is self-contained,
// so the tracker is not consulted.
KOKKOS_INLINE_FUNCTION
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
{
return arg_handle ;
}
// Subview assignment: produce a handle shifted by 'offset' elements.
KOKKOS_INLINE_FUNCTION
static handle_type const assign( handle_type const & arg_handle , size_t offset )
{
return handle_type(arg_handle,offset) ;
}
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
#endif
}
};
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@ -47,18 +47,10 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include "Kokkos_Macros.hpp"
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <cuda.h>
#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
#error "Cuda version 4.1 or greater required"
#endif
#if ( __CUDA_ARCH__ < 200 )
#error "Cuda device capability 2.0 or greater required"
#endif
extern "C" {
/* Cuda runtime function, declared in <crt/device_runtime.h>
* Requires capability 2.x or better.
@ -90,30 +82,6 @@ void cuda_abort( const char * const message )
} // namespace Impl
} // namespace Kokkos
#else
namespace Kokkos {
namespace Impl {
KOKKOS_INLINE_FUNCTION
void cuda_abort( const char * const ) {}
}
}
#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
namespace Kokkos {
__device__ inline
void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); }
}
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined(__CUDACC__) && defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

View File

@ -75,15 +75,16 @@
#if defined(_WIN32)
#define KOKKOS_ATOMICS_USE_WINDOWS
#else
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
#if defined( KOKKOS_HAVE_CUDA )
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ATOMICS_USE_CUDA
#endif
#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
! defined( KOKKOS_ATOMICS_USE_OMP31 )
#if ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
! defined( KOKKOS_ATOMICS_USE_OMP31 )
// Compiling for non-Cuda atomic implementation has not been pre-selected.
// Choose the best implementation for the detected compiler.
@ -91,7 +92,7 @@
#if defined( KOKKOS_COMPILER_GNU ) || \
defined( KOKKOS_COMPILER_CLANG ) || \
( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
( defined ( KOKKOS_COMPILER_NVCC ) )
#define KOKKOS_ATOMICS_USE_GCC
@ -126,6 +127,9 @@ namespace Impl {
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ inline
bool lock_address_cuda_space(void* ptr);
@ -135,6 +139,9 @@ bool lock_address_cuda_space(void* ptr);
/// from the provided ptr. This function should only be called
/// after previously successfully aquiring a lock with
lock_address (note: "acquiring" the lock via lock_address).
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ inline
void unlock_address_cuda_space(void* ptr);
}
@ -287,7 +294,7 @@ const char * atomic_query_version()
//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) && !defined(__CUDA_ARCH__)
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)

View File

@ -46,7 +46,14 @@
#include <type_traits>
// Needed for 'is_space<S>::host_mirror_space
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//Schedules for Execution Policies
struct Static {};
struct Dynamic {};
@ -59,7 +66,7 @@ struct Schedule
|| std::is_same<T,Dynamic>::value
, "Kokkos: Invalid Schedule<> type."
);
using schedule_type = Schedule<T>;
using schedule_type = Schedule ;
using type = T;
};
@ -68,11 +75,268 @@ template<typename T>
struct IndexType
{
static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
using index_type = IndexType<T>;
using index_type = IndexType ;
using type = T;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// Generates a trait is_<CONCEPT><T> whose 'value' is true when T declares a
// nested typedef named CONCEPT that refers to T itself (e.g. a memory space
// declares 'typedef ... memory_space' equal to itself).  Detection is done
// by partial specialization on the nested-typedef comparison.
#define KOKKOS_IMPL_IS_CONCEPT( CONCEPT ) \
template< typename T > struct is_ ## CONCEPT { \
private: \
template< typename , typename = std::true_type > struct have : std::false_type {}; \
template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
public: \
enum { value = is_ ## CONCEPT::template have<T>::value }; \
};
// Public concept:
KOKKOS_IMPL_IS_CONCEPT( memory_space )
KOKKOS_IMPL_IS_CONCEPT( memory_traits )
KOKKOS_IMPL_IS_CONCEPT( execution_space )
KOKKOS_IMPL_IS_CONCEPT( execution_policy )
KOKKOS_IMPL_IS_CONCEPT( array_layout )
namespace Impl {
// For backward compatibility:
using Kokkos::is_memory_space ;
using Kokkos::is_memory_traits ;
using Kokkos::is_execution_space ;
using Kokkos::is_execution_policy ;
using Kokkos::is_array_layout ;
// Implementation concept:
KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
KOKKOS_IMPL_IS_CONCEPT( schedule_type )
KOKKOS_IMPL_IS_CONCEPT( index_type )
}
// The macro is internal scaffolding; do not leak it past this point.
#undef KOKKOS_IMPL_IS_CONCEPT
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
// Pairs an execution space with a memory space into a single "device"
// type; both template arguments are validated at compile time.
template< class ExecutionSpace , class MemorySpace >
struct Device {
static_assert( Kokkos::is_execution_space<ExecutionSpace>::value
, "Execution space is not valid" );
static_assert( Kokkos::is_memory_space<MemorySpace>::value
, "Memory space is not valid" );
typedef ExecutionSpace execution_space;
typedef MemorySpace memory_space;
typedef Device<execution_space,memory_space> device_type;
};
// Trait: T "is a space" when it names itself as its own execution_space,
// memory_space, or device_type.  Also exposes the detected space typedefs
// (void when absent) and a host-mirror computation used by View::HostMirror.
template< typename T >
struct is_space {
private:
// Primary templates: not detected; 'space' defaults to void.
template< typename , typename = void >
struct exe : std::false_type { typedef void space ; };
template< typename , typename = void >
struct mem : std::false_type { typedef void space ; };
template< typename , typename = void >
struct dev : std::false_type { typedef void space ; };
// Partial specializations: the std::conditional<true,void,...> idiom is a
// void_t-style SFINAE probe — the specialization participates only when the
// nested typedef exists; the base class then tests U == U::<typedef>.
template< typename U >
struct exe<U,typename std::conditional<true,void,typename U::execution_space>::type>
: std::is_same<U,typename U::execution_space>::type
{ typedef typename U::execution_space space ; };
template< typename U >
struct mem<U,typename std::conditional<true,void,typename U::memory_space>::type>
: std::is_same<U,typename U::memory_space>::type
{ typedef typename U::memory_space space ; };
template< typename U >
struct dev<U,typename std::conditional<true,void,typename U::device_type>::type>
: std::is_same<U,typename U::device_type>::type
{ typedef typename U::device_type space ; };
typedef typename is_space::template exe<T> is_exe ;
typedef typename is_space::template mem<T> is_mem ;
typedef typename is_space::template dev<T> is_dev ;
public:
enum { value = is_exe::value || is_mem::value || is_dev::value };
typedef typename is_exe::space execution_space ;
typedef typename is_mem::space memory_space ;
// For backward compatibility, deprecated in favor of
// Kokkos::Impl::HostMirror<S>::host_mirror_space
// Host-accessible memory spaces (HostSpace, and the host-visible CUDA
// spaces when CUDA is enabled) mirror to themselves; otherwise HostSpace.
typedef typename std::conditional
< std::is_same< memory_space , Kokkos::HostSpace >::value
#if defined( KOKKOS_HAVE_CUDA )
|| std::is_same< memory_space , Kokkos::CudaUVMSpace >::value
|| std::is_same< memory_space , Kokkos::CudaHostPinnedSpace >::value
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
, memory_space
, Kokkos::HostSpace
>::type host_memory_space ;
#if defined( KOKKOS_HAVE_CUDA )
typedef typename std::conditional
< std::is_same< execution_space , Kokkos::Cuda >::value
, Kokkos::DefaultHostExecutionSpace , execution_space
>::type host_execution_space ;
#else
typedef execution_space host_execution_space ;
#endif
// If T is already fully host-resident the mirror is T itself; otherwise
// a Device of the host execution and memory spaces computed above.
typedef typename std::conditional
< std::is_same< execution_space , host_execution_space >::value &&
std::is_same< memory_space , host_memory_space >::value
, T , Kokkos::Device< host_execution_space , host_memory_space >
>::type host_mirror_space ;
};
// For backward compatibility
namespace Impl {
using Kokkos::is_space ;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Access relationship between DstMemorySpace and SrcMemorySpace
*
* The default case can assume accessibility for the same space.
* Specializations must be defined for different memory spaces.
*/
template< typename DstMemorySpace , typename SrcMemorySpace >
struct MemorySpaceAccess {
static_assert( Kokkos::is_memory_space< DstMemorySpace >::value &&
Kokkos::is_memory_space< SrcMemorySpace >::value
, "template arguments must be memory spaces" );
/**\brief Can a View (or pointer) to memory in SrcMemorySpace
* be assigned to a View (or pointer) to memory marked DstMemorySpace.
*
* 1. DstMemorySpace::execution_space == SrcMemorySpace::execution_space
* 2. All execution spaces that can access DstMemorySpace can also access
* SrcMemorySpace.
*/
// Default (unspecialized) case: only identical spaces are assignable.
enum { assignable = std::is_same<DstMemorySpace,SrcMemorySpace>::value };
/**\brief For all DstExecSpace::memory_space == DstMemorySpace
* DstExecSpace can access SrcMemorySpace.
*/
enum { accessible = assignable };
/**\brief Does a DeepCopy capability exist
* to DstMemorySpace from SrcMemorySpace
*/
enum { deepcopy = assignable };
};
/**\brief Can AccessSpace access MemorySpace ?
*
* Requires:
* Kokkos::is_space< AccessSpace >::value
* Kokkos::is_memory_space< MemorySpace >::value
*
* Can AccessSpace::execution_space access MemorySpace ?
* enum : bool { accessible };
*
* Is View<AccessSpace::memory_space> assignable from View<MemorySpace> ?
* enum : bool { assignable };
*
* If ! accessible then through which intercessory memory space
* should be used to deep copy memory for
* AccessSpace::execution_space
* to get access.
* When AccessSpace::memory_space == Kokkos::HostSpace
* then space is the View host mirror space.
*/
template< typename AccessSpace , typename MemorySpace >
struct SpaceAccessibility {
private:
static_assert( Kokkos::is_space< AccessSpace >::value
, "template argument #1 must be a Kokkos space" );
static_assert( Kokkos::is_memory_space< MemorySpace >::value
, "template argument #2 must be a Kokkos memory space" );
// The input AccessSpace may be a Device<ExecSpace,MemSpace>
// verify that it is a valid combination of spaces.
static_assert( Kokkos::Impl::MemorySpaceAccess
< typename AccessSpace::execution_space::memory_space
, typename AccessSpace::memory_space
>::accessible
, "template argument #1 is an invalid space" );
// Relationship of MemorySpace as seen from the execution space's own memory.
typedef Kokkos::Impl::MemorySpaceAccess
< typename AccessSpace::execution_space::memory_space , MemorySpace >
exe_access ;
// Relationship of MemorySpace as seen from AccessSpace's memory space.
typedef Kokkos::Impl::MemorySpaceAccess
< typename AccessSpace::memory_space , MemorySpace >
mem_access ;
public:
/**\brief Can AccessSpace::execution_space access MemorySpace ?
*
* Default based upon memory space accessibility.
* Specialization required for other relationships.
*/
enum { accessible = exe_access::accessible };
/**\brief Can assign to AccessSpace from MemorySpace ?
*
* Default based upon memory space accessibility.
* Specialization required for other relationships.
*/
// Only meaningful when AccessSpace is itself a memory space.
enum { assignable =
is_memory_space< AccessSpace >::value && mem_access::assignable };
/**\brief Can deep copy to AccessSpace::memory_Space from MemorySpace ? */
enum { deepcopy = mem_access::deepcopy };
// What intercessory space for AccessSpace::execution_space
// to be able to access MemorySpace?
// If same memory space or not accessible use the AccessSpace
// else construct a device with execution space and memory space.
typedef typename std::conditional
< std::is_same<typename AccessSpace::memory_space,MemorySpace>::value ||
! exe_access::accessible
, AccessSpace
, Kokkos::Device< typename AccessSpace::execution_space , MemorySpace >
>::type space ;
};
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
#endif // KOKKOS_CORE_CONCEPTS_HPP

View File

@ -72,6 +72,7 @@
#include <Kokkos_Vectorization.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_hwloc.hpp>
#include <Kokkos_Timer.hpp>
#ifdef KOKKOS_HAVE_CXX11
#include <Kokkos_Complex.hpp>
@ -112,7 +113,6 @@ void fence();
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
@ -155,18 +155,8 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
reallocate_tracked( arg_alloc , arg_alloc_size );
}
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
using Kokkos::Experimental::kokkos_malloc ;
using Kokkos::Experimental::kokkos_realloc ;
using Kokkos::Experimental::kokkos_free ;
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -49,6 +49,7 @@
// and compiler environment then sets a collection of #define macros.
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Utilities.hpp>
//----------------------------------------------------------------------------
// Have assumed a 64bit build (8byte pointers) throughout the code base.
@ -207,7 +208,7 @@ namespace Impl {
template< class Functor
, class Policy
, class EnableFunctor = void
, class EnableFunctor = void
, class EnablePolicy = void
>
struct FunctorPolicyExecutionSpace;
@ -220,7 +221,7 @@ struct FunctorPolicyExecutionSpace;
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelFor ;
/// \class ParallelReduce
@ -229,7 +230,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelReduce ;
/// \class ParallelScan
@ -238,8 +239,8 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelScan ;
}}

View File

@ -56,7 +56,7 @@
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
@ -229,6 +229,39 @@ private:
namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
< Kokkos::CudaSpace
, Kokkos::Cuda::scratch_memory_space
>
{
// Scratch-memory Views cannot be reassigned to CudaSpace Views.
enum { assignable = false };
// Cuda execution can read/write its own scratch memory.
enum { accessible = true };
// No deep-copy path into/out of scratch memory.
enum { deepcopy = false };
};
#if defined( KOKKOS_USE_CUDA_UVM )
// If forcing use of UVM everywhere
// then must assume that CudaUVMSpace
// can be a stand-in for CudaSpace.
// This will fail when a strange host-side execution space
// defines CudaUVMSpace as its preferred memory space.
template<>
struct MemorySpaceAccess
< Kokkos::CudaUVMSpace
, Kokkos::Cuda::scratch_memory_space
>
{
// Same relationship as CudaSpace vs. scratch: accessible but not
// assignable, and no deep-copy path (UVM stands in for CudaSpace here).
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = false };
};
#endif
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::CudaSpace
@ -259,9 +292,6 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <Cuda/KokkosExp_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>

View File

@ -88,6 +88,9 @@ public:
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
static void access_error();
@ -97,7 +100,8 @@ private:
int m_device ; ///< Which Cuda device
// friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
static constexpr const char* m_name = "Cuda";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
};
namespace Impl {
@ -156,6 +160,14 @@ public:
/** \brief If UVM capability is available */
static bool available();
/*--------------------------------*/
/** \brief CudaUVMSpace specific routine */
static int number_of_allocations();
/*--------------------------------*/
/*--------------------------------*/
CudaUVMSpace();
@ -172,11 +184,16 @@ public:
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
/*--------------------------------*/
private:
int m_device ; ///< Which Cuda device
static constexpr const char* m_name = "CudaUVM";
};
} // namespace Kokkos
@ -215,6 +232,13 @@ public:
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
static constexpr const char* m_name = "CudaHostPinned";
/*--------------------------------*/
};
@ -226,6 +250,126 @@ public:
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
//----------------------------------------
// Relationships with HostSpace as the destination space.
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace > {
// Host cannot dereference device memory; transfer requires deep copy.
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace > {
// HostSpace::execution_space != CudaUVMSpace::execution_space
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace > {
// HostSpace::execution_space == CudaHostPinnedSpace::execution_space
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
//----------------------------------------
// Relationships with CudaSpace as the destination space.
template<>
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace > {
// Cuda execution cannot dereference host memory; deep copy required.
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace > {
// CudaSpace::execution_space == CudaUVMSpace::execution_space
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace > {
// CudaSpace::execution_space != CudaHostPinnedSpace::execution_space
enum { assignable = false };
enum { accessible = true }; // CudaSpace::execution_space
enum { deepcopy = true };
};
//----------------------------------------
// CudaUVMSpace::execution_space == Cuda
// CudaUVMSpace accessible to both Cuda and Host
// Relationships with CudaUVMSpace as the destination space.
template<>
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = false }; // Cuda cannot access HostSpace
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace > {
// CudaUVMSpace::execution_space == CudaSpace::execution_space
// Can access CudaUVMSpace from Host but cannot access CudaSpace from Host
enum { assignable = false };
// CudaUVMSpace::execution_space can access CudaSpace
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace > {
// CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space
enum { assignable = false };
enum { accessible = true }; // CudaUVMSpace::execution_space
enum { deepcopy = true };
};
//----------------------------------------
// CudaHostPinnedSpace::execution_space == HostSpace::execution_space
// CudaHostPinnedSpace accessible to both Cuda and Host
// Relationships with CudaHostPinnedSpace as the destination space.
template<>
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace > {
enum { assignable = false }; // Cannot access from Cuda
enum { accessible = true }; // CudaHostPinnedSpace::execution_space
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace > {
enum { assignable = false }; // Cannot access from Host
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace > {
enum { assignable = false }; // different execution_space
enum { accessible = true }; // same accessibility
enum { deepcopy = true };
};
//----------------------------------------
}} // namespace Kokkos::Impl
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void DeepCopyAsyncCuda( void * dst , const void * src , size_t n);
template<> struct DeepCopy< CudaSpace , CudaSpace , Cuda>
@ -553,7 +697,6 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHost
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
@ -791,7 +934,6 @@ public:
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------

View File

@ -52,6 +52,7 @@
#include <impl/Kokkos_AnalyzePolicy.hpp>
#include <Kokkos_Concepts.hpp>
#include <iostream>
//----------------------------------------------------------------------------
namespace Kokkos {
@ -82,7 +83,6 @@ class RangePolicy
: public Impl::PolicyTraits<Properties ... >
{
private:
typedef Impl::PolicyTraits<Properties ... > traits;
typename traits::execution_space m_space ;
@ -90,8 +90,8 @@ private:
typename traits::index_type m_end ;
typename traits::index_type m_granularity ;
typename traits::index_type m_granularity_mask ;
public:
public:
//! Tag this class as an execution policy
typedef RangePolicy execution_policy;
typedef typename traits::index_type member_type ;
@ -100,7 +100,6 @@ public:
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
//TODO: find a better workaround for Clangs weird instantiation order
// This thing is here because of an instantiation error, where the RangePolicy is inserted into FunctorValue Traits, which
// tries decltype on the operator. It tries to do this even though the first argument of parallel for clearly doesn't match.
@ -135,47 +134,45 @@ public:
, work_begin , work_end )
{}
public:
public:
/** \brief return chunk_size */
inline member_type chunk_size() const {
return m_granularity;
}
/** \brief return chunk_size */
inline member_type chunk_size() const {
return m_granularity;
}
/** \brief set chunk_size to a discrete value*/
inline RangePolicy set_chunk_size(int chunk_size_) const {
RangePolicy p = *this;
p.m_granularity = chunk_size_;
p.m_granularity_mask = p.m_granularity - 1;
return p;
}
/** \brief set chunk_size to a discrete value*/
inline RangePolicy set_chunk_size(int chunk_size_) const {
RangePolicy p = *this;
p.m_granularity = chunk_size_;
p.m_granularity_mask = p.m_granularity - 1;
return p;
}
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
typename traits::index_type concurrency = traits::execution_space::concurrency();
if( concurrency==0 ) concurrency=1;
typename traits::index_type concurrency = traits::execution_space::concurrency();
if( concurrency==0 ) concurrency=1;
if(m_granularity > 0) {
if(!Impl::is_integral_power_of_two( m_granularity ))
Kokkos::abort("RangePolicy blocking granularity must be power of two" );
}
if(m_granularity > 0) {
if(!Impl::is_integral_power_of_two( m_granularity ))
Kokkos::abort("RangePolicy blocking granularity must be power of two" );
}
member_type new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_end-m_begin)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_granularity = new_chunk_size;
m_granularity_mask = m_granularity - 1;
}
member_type new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_end-m_begin)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_granularity = new_chunk_size;
m_granularity_mask = m_granularity - 1;
}
public:
public:
/** \brief Subrange for a partition's rank and size.
*
* Typically used to partition a range over a group of threads.
@ -212,16 +209,15 @@ public:
if ( range.end() < m_end ) m_end = range.end() ;
}
}
private:
member_type m_begin ;
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
private:
member_type m_begin ;
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
};
};
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -231,7 +227,6 @@ namespace Kokkos {
namespace Impl {
template< class ExecSpace, class ... Properties>
class TeamPolicyInternal: public Impl::PolicyTraits<Properties ... > {
private:
@ -245,6 +240,10 @@ public:
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*
* This function only works for single-operator functors.
* With multi-operator functors it cannot be determined
* which operator will be called.
*/
template< class FunctorType >
static int team_size_max( const FunctorType & );
@ -254,6 +253,10 @@ public:
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*
* This function only works for single-operator functors.
* With multi-operator functors it cannot be determined
* which operator will be called.
*/
template< class FunctorType >
static int team_size_recommended( const FunctorType & );
@ -344,9 +347,7 @@ public:
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
};
};
}
namespace Impl {
struct PerTeamValue {
int value;
PerTeamValue(int arg);
@ -356,12 +357,12 @@ namespace Impl {
int value;
PerThreadValue(int arg);
};
}
Impl::PerTeamValue PerTeam(const int& arg);
Impl::PerThreadValue PerThread(const int& arg);
/** \brief Execution policy for parallel work over a league of teams of threads.
*
* The work functor is called for each thread of each team such that
@ -443,10 +444,6 @@ public:
};
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType, class TeamMemberType>
@ -484,8 +481,8 @@ public:
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
, const iType& arg_end
)
, const iType& arg_end
)
: start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, thread( arg_thread )
@ -502,32 +499,33 @@ public:
{}
};
template<typename iType, class TeamMemberType>
struct ThreadVectorRangeBoundariesStruct {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
template<typename iType, class TeamMemberType>
struct ThreadVectorRangeBoundariesStruct {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count):
end( count )
{}
};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct ( const TeamMemberType, const iType& count ) : end( count ) {}
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct ( const iType& count ) : end( count ) {}
};
template<class TeamMemberType>
struct ThreadSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
template<class TeamMemberType>
struct ThreadSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
ThreadSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
};
template<class TeamMemberType>
struct VectorSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
VectorSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
};
template<class TeamMemberType>
struct VectorSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
} // namespace Impl
/** \brief Execution policy for parallel work over a threads within a team.
@ -538,7 +536,8 @@ public:
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType>
TeamThreadRange( const TeamMemberType&, const iType& count );
/** \brief Execution policy for parallel work over a threads within a team.
*
@ -546,9 +545,10 @@ Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(cons
* This policy is used together with a parallel pattern as a nested layer within a kernel launched
* with the TeamPolicy. This variant expects a begin and end. So the range is (begin,end].
*/
template<typename iType, class TeamMemberType>
template<typename iType1, typename iType2, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
Impl::TeamThreadRangeBoundariesStruct<typename std::common_type<iType1, iType2>::type, TeamMemberType>
TeamThreadRange( const TeamMemberType&, const iType1& begin, const iType2& end );
/** \brief Execution policy for a vector parallel loop.
*
@ -558,13 +558,12 @@ Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(cons
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
ThreadVectorRange( const TeamMemberType&, const iType& count );
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -46,7 +46,6 @@
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_HBWAllocators.hpp>
/*--------------------------------------------------------------------------*/
#ifdef KOKKOS_HAVE_HBWSPACE
@ -148,11 +147,14 @@ public:
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
AllocationMechanism m_alloc_mech ;
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
static constexpr const char* m_name = "HBW";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
};
} // namespace Experimental
@ -162,7 +164,6 @@ private:
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
@ -239,9 +240,33 @@ public:
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
// Host <-> high-bandwidth-memory (HBW) relationships.
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
// NOTE(review): assignability is asymmetric with the reverse
// specialization below — appears intentional, but confirm.
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = true };
};
}}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

Some files were not shown because too many files have changed in this diff Show More