Compare commits
36 Commits
patch_10Au
...
patch_17Au
| Author | SHA1 | Date | |
|---|---|---|---|
| 7ddcb6812b | |||
| 76cd61350d | |||
| fa3c0c61d6 | |||
| c46d5ff422 | |||
| dd67989c76 | |||
| 00aafef1a8 | |||
| 7175abcc71 | |||
| e34b20405c | |||
| 1d4d2155a2 | |||
| cee87d7a54 | |||
| 60e14f1490 | |||
| 81e7d4a942 | |||
| 0b3f1b8a15 | |||
| b209a4e246 | |||
| 27553283c3 | |||
| df56b2d6a4 | |||
| c6d923b6c8 | |||
| 6d24be8bb7 | |||
| 8c16ea1bfc | |||
| c8741f3a01 | |||
| 2a7d2dee36 | |||
| da01be7c18 | |||
| 3e9b41c6b7 | |||
| 8a7a831bd6 | |||
| 8431ca5fec | |||
| 13f2d39f55 | |||
| aa60ef6ed8 | |||
| a71f5a0c20 | |||
| 3d1d0c58c7 | |||
| cdac5f496c | |||
| 8c9db3ea00 | |||
| ea2b01e83b | |||
| 34fe2273f6 | |||
| 77c60189b8 | |||
| 1c6533e53d | |||
| 68206079da |
21
.github/CODEOWNERS
vendored
Normal file
21
.github/CODEOWNERS
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
# This file contains file patterns that trigger automatic
|
||||
# code review requests from users that are owners of these files
|
||||
# Order matters, the last match has the highest precedence
|
||||
|
||||
# library folders
|
||||
lib/colvars/* @giacomofiorin
|
||||
lib/compress/* @akohlmey
|
||||
lib/kokkos/* @stanmoore1
|
||||
lib/molfile/* @akohlmey
|
||||
lib/qmmm/* @akohlmey
|
||||
lib/vtk/* @rbberger
|
||||
|
||||
# packages
|
||||
src/KOKKOS @stanmoore1
|
||||
src/USER-CGSDK @akohlmey
|
||||
src/USER-COLVARS @giacomofiorin
|
||||
src/USER-OMP @akohlmey
|
||||
src/USER-QMMM @akohlmey
|
||||
|
||||
# tools
|
||||
tools/msi2lmp/* @akohlmey
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 20 KiB |
@ -1,7 +1,7 @@
|
||||
<!-- HTML_ONLY -->
|
||||
<HEAD>
|
||||
<TITLE>LAMMPS Users Manual</TITLE>
|
||||
<META NAME="docnumber" CONTENT="10 Aug 2017 version">
|
||||
<META NAME="docnumber" CONTENT="17 Aug 2017 version">
|
||||
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
|
||||
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
|
||||
</HEAD>
|
||||
@ -21,7 +21,7 @@
|
||||
<H1></H1>
|
||||
|
||||
LAMMPS Documentation :c,h3
|
||||
10 Aug 2017 version :c,h4
|
||||
17 Aug 2017 version :c,h4
|
||||
|
||||
Version info: :h4
|
||||
|
||||
@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the
|
||||
"LAMMPS project on GitHub."_https://github.com/lammps/lammps
|
||||
The lammps.org domain, currently hosting "public continuous integration
|
||||
testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux
|
||||
RPM and Windows installer packages"_http://rpm.lammps.org is located
|
||||
RPM and Windows installer packages"_http://packages.lammps.org is located
|
||||
at Temple University and managed by Richard Berger,
|
||||
richard.berger at temple.edu.
|
||||
|
||||
|
||||
Binary file not shown.
@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
|
||||
"hybrid"_pair_hybrid.html,
|
||||
"hybrid/overlay"_pair_hybrid.html,
|
||||
"adp (o)"_pair_adp.html,
|
||||
"airebo (o)"_pair_airebo.html,
|
||||
"airebo/morse (o)"_pair_airebo.html,
|
||||
"airebo (oi)"_pair_airebo.html,
|
||||
"airebo/morse (oi)"_pair_airebo.html,
|
||||
"beck (go)"_pair_beck.html,
|
||||
"body"_pair_body.html,
|
||||
"bop"_pair_bop.html,
|
||||
@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
|
||||
"dpd/tstat (go)"_pair_dpd.html,
|
||||
"dsmc"_pair_dsmc.html,
|
||||
"eam (gkiot)"_pair_eam.html,
|
||||
"eam/alloy (gkot)"_pair_eam.html,
|
||||
"eam/fs (gkot)"_pair_eam.html,
|
||||
"eam/alloy (gkiot)"_pair_eam.html,
|
||||
"eam/fs (gkiot)"_pair_eam.html,
|
||||
"eim (o)"_pair_eim.html,
|
||||
"gauss (go)"_pair_gauss.html,
|
||||
"gayberne (gio)"_pair_gayberne.html,
|
||||
@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
|
||||
"kim"_pair_kim.html,
|
||||
"lcbop"_pair_lcbop.html,
|
||||
"line/lj"_pair_line_lj.html,
|
||||
"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
|
||||
"lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/long (giko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/long (gkio)"_pair_charmm.html,
|
||||
"lj/charmm/coul/msm"_pair_charmm.html,
|
||||
"lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
|
||||
"lj/charmmfsw/coul/long"_pair_charmm.html,
|
||||
@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
|
||||
"polymorphic"_pair_polymorphic.html,
|
||||
"python"_pair_python.html,
|
||||
"reax"_pair_reax.html,
|
||||
"rebo (o)"_pair_airebo.html,
|
||||
"rebo (oi)"_pair_airebo.html,
|
||||
"resquared (go)"_pair_resquared.html,
|
||||
"snap"_pair_snap.html,
|
||||
"soft (go)"_pair_soft.html,
|
||||
|
||||
@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd
|
||||
|
||||
{New bond exceeded special list size in fix bond/create} :dt
|
||||
|
||||
See the "special_bonds extra" command
|
||||
(or the "read_data extra/special/per/atom" command)
|
||||
See the "read_data extra/special/per/atom" command
|
||||
(or the "create_box extra/special/per/atom" command)
|
||||
for info on how to leave space in the special bonds
|
||||
list to allow for additional bonds to be formed. :dd
|
||||
|
||||
@ -9666,8 +9666,8 @@ you are running. :dd
|
||||
|
||||
{Special list size exceeded in fix bond/create} :dt
|
||||
|
||||
See the special_bonds extra command
|
||||
(or the read_data extra/special/per/atom command)
|
||||
See the "read_data extra/special/per/atom" command
|
||||
(or the "create_box extra/special/per/atom" command)
|
||||
for info on how to leave space in the special bonds
|
||||
list to allow for additional bonds to be formed. :dd
|
||||
|
||||
|
||||
@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS
|
||||
and Windows system libraries to Unix-like environments like Linux
|
||||
or MacOS, when compiling for Windows a few adjustments may be needed:
|
||||
|
||||
Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
|
||||
Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
|
||||
Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable)
|
||||
Try adding -static-libgcc or -static or both to the linker flags when your
|
||||
LAMMPS executable complains about missing .dll files :ul
|
||||
Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul
|
||||
|
||||
Since none of the current LAMMPS core developers
|
||||
has significant experience building executables on Windows, we are
|
||||
happy to distribute contributed instructions and modifications, but
|
||||
we cannot provide support for those.
|
||||
Since none of the current LAMMPS core developers has significant
|
||||
experience building executables on Windows, we are happy to distribute
|
||||
contributed instructions and modifications to improve the situation,
|
||||
but we cannot provide support for those.
|
||||
|
||||
With the so-called "Anniversary Update" to Windows 10, there is an
|
||||
Ubuntu Linux subsystem available for Windows, that can be installed
|
||||
and then used to compile/install LAMMPS as if you are running on an
|
||||
Ubuntu Linux system instead of Windows.
|
||||
|
||||
As an alternative, you can download "daily builds" (and some older
|
||||
versions) of the installer packages from
|
||||
"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html.
|
||||
These executables are built with most optional packages and the
|
||||
download includes documentation, potential files, some tools and
|
||||
many examples, but no source code.
|
||||
As an alternative, you can download pre-compiled installer packages from
|
||||
"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html.
|
||||
These executables are built with most optional packages included and the
|
||||
download includes documentation, potential files, some tools and many
|
||||
examples, but no source code.
|
||||
|
||||
:line
|
||||
|
||||
@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages.
|
||||
:line
|
||||
|
||||
On a Windows box, you can skip making LAMMPS and simply download an
|
||||
installer package from "here"_http://rpm.lammps.org/windows.html
|
||||
installer package from "here"_http://packages.lammps.org/windows.html
|
||||
|
||||
For running the non-MPI executable, follow these steps:
|
||||
|
||||
@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l
|
||||
|
||||
At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj]
|
||||
with the name of your LAMMPS input script. :l
|
||||
|
||||
The serial executable includes support for multi-threading
|
||||
parallelization from the styles in the USER-OMP package.
|
||||
|
||||
To run with, e.g. 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp"
|
||||
:ule
|
||||
|
||||
For the MPI version, which allows you to run LAMMPS under Windows on
|
||||
multiple processors, follow these steps:
|
||||
For the MPI version, which allows you to run LAMMPS under Windows with
|
||||
the more general message passing parallel library (LAMMPS has been
|
||||
designed from the ground up to use MPI efficiently), follow these steps:
|
||||
|
||||
Download and install
|
||||
"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads
|
||||
for Windows. :ulb,l
|
||||
Download and install a compatible MPI library binary package:
|
||||
for 32-bit Windows
|
||||
"mpich2-1.4.1p1-win-ia32.msi"_http://download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi
|
||||
and for 64-bit Windows
|
||||
"mpich2-1.4.1p1-win-x86-64.msi"_http://download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi
|
||||
:ulb,l
|
||||
|
||||
The LAMMPS Windows installer packages will automatically adjust your
|
||||
path for the default location of this MPI package. After the installation
|
||||
of the MPICH software, it needs to be integrated into the system.
|
||||
of the MPICH2 software, it needs to be integrated into the system.
|
||||
For this you need to start a Command Prompt in {Administrator Mode}
|
||||
(right click on the icon and select it). Change into the MPICH2
|
||||
installation directory, then into the subdirectory [bin] and execute
|
||||
@ -1137,7 +1144,7 @@ or
|
||||
|
||||
mpiexec -np 4 lmp_mpi -in in.lj :pre
|
||||
|
||||
replacing in.lj with the name of your LAMMPS input script. For the latter
|
||||
replacing [in.lj] with the name of your LAMMPS input script. For the latter
|
||||
case, you may be prompted to enter your password. :l
|
||||
|
||||
In this mode, output may not immediately show up on the screen, so if
|
||||
@ -1149,6 +1156,11 @@ something like:
|
||||
|
||||
lmp_mpi -in in.lj :pre
|
||||
|
||||
And the parallel executable also includes OpenMP multi-threading, which
|
||||
can be combined with MPI using something like:
|
||||
|
||||
mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre
|
||||
|
||||
:ule
|
||||
|
||||
:line
|
||||
|
||||
@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
|
||||
Dihedral Styles: charmm, harmonic, opls :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod :l
|
||||
Improper Styles: cvff, harmonic :l
|
||||
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
|
||||
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
|
||||
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
|
||||
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
|
||||
sw, tersoff :l
|
||||
K-Space Styles: pppm, pppm/disp :l
|
||||
:ule
|
||||
|
||||
|
||||
@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor
|
||||
but may also induce *many* new 2nd and 3rd neighbors, depending on the
|
||||
molecular topology of your system. The "extra special per atom"
|
||||
parameter must typically be set to allow for the new maximum total
|
||||
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3
|
||||
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2
|
||||
ways to do this. See the "read_data"_read_data.html or
|
||||
"create_box"_create_box.html or "special_bonds extra" commands for
|
||||
details.
|
||||
"create_box"_create_box.html commands for details.
|
||||
|
||||
NOTE: Even if you do not use the {atype}, {dtype}, or {itype}
|
||||
keywords, the list of topological neighbors is updated for atoms
|
||||
|
||||
@ -7,10 +7,13 @@
|
||||
:line
|
||||
|
||||
pair_style airebo command :h3
|
||||
pair_style airebo/intel command :h3
|
||||
pair_style airebo/omp command :h3
|
||||
pair_style airebo/morse command :h3
|
||||
pair_style airebo/morse/intel command :h3
|
||||
pair_style airebo/morse/omp command :h3
|
||||
pair_style rebo command :h3
|
||||
pair_style rebo/intel command :h3
|
||||
pair_style rebo/omp command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
:line
|
||||
|
||||
pair_style lj/charmm/coul/charmm command :h3
|
||||
pair_style lj/charmm/coul/charmm/intel command :h3
|
||||
pair_style lj/charmm/coul/charmm/omp command :h3
|
||||
pair_style lj/charmm/coul/charmm/implicit command :h3
|
||||
pair_style lj/charmm/coul/charmm/implicit/omp command :h3
|
||||
|
||||
@ -14,6 +14,7 @@ pair_style eam/omp command :h3
|
||||
pair_style eam/opt command :h3
|
||||
pair_style eam/alloy command :h3
|
||||
pair_style eam/alloy/gpu command :h3
|
||||
pair_style eam/alloy/intel command :h3
|
||||
pair_style eam/alloy/kk command :h3
|
||||
pair_style eam/alloy/omp command :h3
|
||||
pair_style eam/alloy/opt command :h3
|
||||
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
|
||||
pair_style eam/cd/omp command :h3
|
||||
pair_style eam/fs command :h3
|
||||
pair_style eam/fs/gpu command :h3
|
||||
pair_style eam/fs/intel command :h3
|
||||
pair_style eam/fs/kk command :h3
|
||||
pair_style eam/fs/omp command :h3
|
||||
pair_style eam/fs/opt command :h3
|
||||
|
||||
@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c
|
||||
{coul} values = w1,w2,w3
|
||||
w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions
|
||||
{angle} value = {yes} or {no}
|
||||
{dihedral} value = {yes} or {no}
|
||||
{extra} value = N
|
||||
N = number of extra 1-2,1-3,1-4 interactions to save space for :pre
|
||||
{dihedral} value = {yes} or {no} :pre
|
||||
:ule
|
||||
|
||||
Examples:
|
||||
@ -36,8 +34,7 @@ special_bonds amber
|
||||
special_bonds charmm
|
||||
special_bonds fene dihedral no
|
||||
special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes
|
||||
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes
|
||||
special_bonds lj/coul 0 1 1 extra 2 :pre
|
||||
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting
|
||||
of 1.0). If the {dihedral} keyword is specified as {no} which is the
|
||||
default, then the 2,5 interaction will also be weighted by 0.5.
|
||||
|
||||
The {extra} keyword can be used when additional bonds will be created
|
||||
during a simulation run, e.g. by the "fix
|
||||
bond/create"_fix_bond_create.html command. It can also be used if
|
||||
molecules will be added to the system, e.g. via the "fix
|
||||
deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which
|
||||
will have atoms with more special neighbors than any atom in the
|
||||
current system has.
|
||||
|
||||
:line
|
||||
|
||||
NOTE: LAMMPS stores and maintains a data structure with a list of the
|
||||
@ -194,8 +183,9 @@ the system). If new bonds are created (or molecules added containing
|
||||
atoms with more special neighbors), the size of this list needs to
|
||||
grow. Note that adding a single bond always adds a new 1st neighbor
|
||||
but may also induce *many* new 2nd and 3rd neighbors, depending on the
|
||||
molecular topology of your system. Using the {extra} keyword leaves
|
||||
empty space in the list for this N additional 1st, 2nd, or 3rd
|
||||
molecular topology of your system. Using the {extra/special/per/atom}
|
||||
keyword to either "read_data"_read_data.html or "create_box"_create_box.html
|
||||
reserves empty space in the list for this N additional 1st, 2nd, or 3rd
|
||||
neighbors to be added. If you do not do this, you may get an error
|
||||
when bonds (or molecules) are added.
|
||||
|
||||
@ -203,8 +193,7 @@ when bonds (or molecules) are added.
|
||||
|
||||
NOTE: If you reuse this command in an input script, you should set all
|
||||
the options you need each time. This command cannot be used a 2nd
|
||||
time incrementally, e.g. to add some extra storage locations via the
|
||||
{extra} keyword. E.g. these two commands:
|
||||
time incrementally. E.g. these two commands:
|
||||
|
||||
special_bonds lj 0.0 1.0 1.0
|
||||
special_bonds coul 0.0 0.0 1.0
|
||||
@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0
|
||||
because the LJ settings are reset to their default values
|
||||
each time the command is issued.
|
||||
|
||||
Likewise
|
||||
|
||||
special_bonds amber
|
||||
special_bonds extra 2 :pre
|
||||
|
||||
is not the same as this single command:
|
||||
|
||||
special_bonds amber extra 2 :pre
|
||||
|
||||
since in the former case, the 2nd command will reset all the LJ and
|
||||
Coulombic weights to 0.0 (the default).
|
||||
|
||||
One exception to this rule is the {extra} option itself. It is not
|
||||
reset to its default value of 0 each time the special_bonds command is
|
||||
invoked. This is because it can also be set by the
|
||||
"read_data"_read_data.html and "create_box"_create_box.html commands,
|
||||
so this command will not override those settings unless you explicitly
|
||||
use {extra} as an option.
|
||||
|
||||
[Restrictions:] none
|
||||
|
||||
[Related commands:]
|
||||
|
||||
0
doc/src/tutorial_bash_on_windows.txt
Executable file → Normal file
0
doc/src/tutorial_bash_on_windows.txt
Executable file → Normal file
@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching
|
||||
DC-DP pairs and will treat DP as equivalent to their DC in the
|
||||
{special bonds} relations. It may be necessary to extend the space
|
||||
for storing such special relations. In this case extra space should
|
||||
be reserved by using the {extra} keyword of the {special_bonds}
|
||||
be reserved by using the {extra/special/per/atom} keyword of either
|
||||
the "read_data"_read_data.html or "create_box"_create_box.html
|
||||
command. With our phenol, there is 1 more special neighbor for which
|
||||
space is required. Otherwise LAMMPS crashes and gives the required
|
||||
value.
|
||||
|
||||
special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre
|
||||
read_data data-p.lmp extra/special/per/atom 1 :pre
|
||||
|
||||
Let us assume we want to run a simple NVT simulation at 300 K. Note
|
||||
that Drude oscillators need to be thermalized at a low temperature in
|
||||
|
||||
0
doc/src/tutorials.txt
Executable file → Normal file
0
doc/src/tutorials.txt
Executable file → Normal file
@ -45,12 +45,12 @@ while iarg < nargs:
|
||||
if args[iarg] == "-m":
|
||||
if iarg+2 > len(args): error()
|
||||
machine = args[iarg+1]
|
||||
iarg += 2
|
||||
iarg += 2
|
||||
elif args[iarg] == "-e":
|
||||
if iarg+2 > len(args): error()
|
||||
extraflag = True
|
||||
suffix = args[iarg+1]
|
||||
iarg += 2
|
||||
iarg += 2
|
||||
else: error()
|
||||
|
||||
# set lib from working dir
|
||||
|
||||
@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start.
|
||||
|
||||
**Optional**: if you use the Install.py script provided in this folder, you
|
||||
can give the machine name as the '-m' argument. This can be the suffix of one
|
||||
of the files from either this folder, or from src/MAKE.
|
||||
of the files from either this folder, or from src/MAKE/MACHINES.
|
||||
*This is only supported by the Install.py within the lib/colvars folder*.
|
||||
|
||||
When you are done building this library, two files should
|
||||
@ -53,10 +53,10 @@ settings in Makefile.common should work.
|
||||
For the reference manual see:
|
||||
http://colvars.github.io/colvars-refman-lammps
|
||||
|
||||
A copy of reference manual is also in:
|
||||
A copy of the reference manual is also in:
|
||||
doc/PDF/colvars-refman-lammps.pdf
|
||||
|
||||
Also included is a Doxygen-based developer documentation:
|
||||
Also available is Doxygen-based developer documentation:
|
||||
http://colvars.github.io/doxygen/html/
|
||||
|
||||
The reference article is:
|
||||
|
||||
@ -88,7 +88,12 @@ public:
|
||||
static std::vector<feature *> cv_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvar
|
||||
std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cv_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cv_features;
|
||||
}
|
||||
|
||||
|
||||
@ -206,7 +206,12 @@ public:
|
||||
static std::vector<feature *> ag_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for atom group
|
||||
virtual std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return ag_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return ag_features;
|
||||
}
|
||||
|
||||
|
||||
@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os)
|
||||
os << " ";
|
||||
if (b_output_energy)
|
||||
os << " "
|
||||
<< std::setprecision(cvm::en_prec) << std::setw(cvm::en_width)
|
||||
<< bias_energy;
|
||||
return os;
|
||||
}
|
||||
|
||||
@ -175,7 +175,11 @@ public:
|
||||
static std::vector<feature *> cvb_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvarbias
|
||||
virtual std::vector<feature *> &features()
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cvb_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cvb_features;
|
||||
}
|
||||
|
||||
@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf)
|
||||
if (null_centers) {
|
||||
// try to initialize the restraint centers for the first time
|
||||
colvar_centers.resize(num_variables());
|
||||
colvar_centers_raw.resize(num_variables());
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvar_centers[i].type(variables(i)->value());
|
||||
colvar_centers[i].reset();
|
||||
colvar_centers_raw[i].type(variables(i)->value());
|
||||
colvar_centers_raw[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf)
|
||||
if (cvm::debug()) {
|
||||
cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n");
|
||||
}
|
||||
colvar_centers_raw[i] = colvar_centers[i];
|
||||
colvar_centers[i].apply_constraints();
|
||||
}
|
||||
null_centers = false;
|
||||
@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf)
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers[i].type(variables(i)->value());
|
||||
colvar_centers[i].apply_constraints();
|
||||
colvar_centers_raw[i].type(variables(i)->value());
|
||||
colvar_centers_raw[i] = colvar_centers[i];
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf)
|
||||
{
|
||||
if (b_chg_centers || b_chg_force_k) {
|
||||
if (target_nstages) {
|
||||
// cvm::log ("Reading current stage from the restart.\n");
|
||||
if (!get_keyval(conf, "stage", stage))
|
||||
cvm::error("Error: current stage is missing from the restart.\n");
|
||||
}
|
||||
@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf)
|
||||
|
||||
size_t i;
|
||||
if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) {
|
||||
if (colvar_centers.size() != num_variables()) {
|
||||
if (target_centers.size() != num_variables()) {
|
||||
cvm::error("Error: number of target centers does not match "
|
||||
"that of collective variables.\n");
|
||||
"that of collective variables.\n", INPUT_ERROR);
|
||||
}
|
||||
b_chg_centers = true;
|
||||
for (i = 0; i < target_centers.size(); i++) {
|
||||
target_centers[i].apply_constraints();
|
||||
centers_incr.push_back(colvar_centers[i]);
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (b_chg_centers) {
|
||||
// parse moving restraint options
|
||||
// parse moving schedule options
|
||||
colvarbias_restraint_moving::init(conf);
|
||||
if (initial_centers.size() == 0) {
|
||||
// One-time init
|
||||
initial_centers = colvar_centers;
|
||||
}
|
||||
// Call to check that the definition is correct
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvarvalue const midpoint =
|
||||
colvarvalue::interpolate(initial_centers[i],
|
||||
target_centers[i],
|
||||
0.5);
|
||||
}
|
||||
} else {
|
||||
target_centers.clear();
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
get_keyval(conf, "outputCenters", b_output_centers, b_output_centers);
|
||||
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work);
|
||||
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work,
|
||||
b_output_acc_work); // TODO this conflicts with stages
|
||||
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda)
|
||||
{
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Updating centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
size_t i;
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i],
|
||||
target_centers[i],
|
||||
lambda);
|
||||
centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]);
|
||||
colvar_centers[i] = c_new;
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
}
|
||||
if (cvm::debug()) {
|
||||
cvm::log("New centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
return cvm::get_error();
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update()
|
||||
{
|
||||
if (b_chg_centers) {
|
||||
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Updating centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
|
||||
if (!centers_incr.size()) {
|
||||
// if this is the first calculation, calculate the advancement
|
||||
// at each simulation step (or stage, if applicable)
|
||||
// (take current stage into account: it can be non-zero
|
||||
// if we are restarting a staged calculation)
|
||||
centers_incr.resize(num_variables());
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].type(variables(i)->value());
|
||||
centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) /
|
||||
cvm::real( target_nstages ? (target_nstages - stage) :
|
||||
(target_nsteps - cvm::step_absolute()));
|
||||
}
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Center increment for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (target_nstages) {
|
||||
if ((cvm::step_relative() > 0)
|
||||
&& (cvm::step_absolute() % target_nsteps) == 0
|
||||
&& stage < target_nstages) {
|
||||
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers_raw[i] += centers_incr[i];
|
||||
colvar_centers[i] = colvar_centers_raw[i];
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
colvar_centers[i].apply_constraints();
|
||||
// Staged update
|
||||
if (stage <= target_nstages) {
|
||||
if ((cvm::step_relative() > 0) &&
|
||||
((cvm::step_absolute() % target_nsteps) == 1)) {
|
||||
cvm::real const lambda =
|
||||
cvm::real(stage)/cvm::real(target_nstages);
|
||||
update_centers(lambda);
|
||||
stage++;
|
||||
cvm::log("Moving restraint \"" + this->name +
|
||||
"\" stage " + cvm::to_str(stage) +
|
||||
" : setting centers to " + cvm::to_str(colvar_centers) +
|
||||
" at step " + cvm::to_str(cvm::step_absolute()));
|
||||
} else {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
stage++;
|
||||
cvm::log("Moving restraint \"" + this->name +
|
||||
"\" stage " + cvm::to_str(stage) +
|
||||
" : setting centers to " + cvm::to_str(colvar_centers) +
|
||||
" at step " + cvm::to_str(cvm::step_absolute()));
|
||||
}
|
||||
} else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) {
|
||||
// move the restraint centers in the direction of the targets
|
||||
// (slow growth)
|
||||
} else {
|
||||
// Continuous update
|
||||
if (cvm::step_absolute() <= target_nsteps) {
|
||||
cvm::real const lambda =
|
||||
cvm::real(cvm::step_absolute())/cvm::real(target_nsteps);
|
||||
update_centers(lambda);
|
||||
} else {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cvm::step_relative() == 0) {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers_raw[i] += centers_incr[i];
|
||||
colvar_centers[i] = colvar_centers_raw[i];
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
colvar_centers[i].apply_constraints();
|
||||
// finite differences are undefined when restarting
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (cvm::debug()) {
|
||||
cvm::log("New centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
cvm::log("Center increment for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(centers_incr)+
|
||||
" at stage "+cvm::to_str(stage)+ ".\n");
|
||||
}
|
||||
}
|
||||
|
||||
return COLVARS_OK;
|
||||
return cvm::get_error();
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update_acc_work()
|
||||
{
|
||||
if (b_output_acc_work) {
|
||||
if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) {
|
||||
if ((cvm::step_relative() > 0) &&
|
||||
(cvm::step_absolute() <= target_nsteps)) {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
// project forces on the calculated increments at this step
|
||||
acc_work += colvar_forces[i] * centers_incr[i];
|
||||
@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
|
||||
<< colvar_centers[i];
|
||||
}
|
||||
os << "\n";
|
||||
os << "centers_raw ";
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
os << " "
|
||||
<< std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width)
|
||||
<< colvar_centers_raw[i];
|
||||
}
|
||||
os << "\n";
|
||||
|
||||
if (b_output_acc_work) {
|
||||
os << "accumulatedWork "
|
||||
@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
|
||||
}
|
||||
}
|
||||
|
||||
return colvarbias_restraint_moving::get_state_params() + os.str();
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
||||
@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con
|
||||
// cvm::log ("Reading the updated restraint centers from the restart.\n");
|
||||
if (!get_keyval(conf, "centers", colvar_centers))
|
||||
cvm::error("Error: restraint centers are missing from the restart.\n");
|
||||
if (!get_keyval(conf, "centers_raw", colvar_centers_raw))
|
||||
cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n");
|
||||
if (b_output_acc_work) {
|
||||
if (!get_keyval(conf, "accumulatedWork", acc_work))
|
||||
cvm::error("Error: accumulatedWork is missing from the restart.\n");
|
||||
@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const
|
||||
<< std::setprecision(cvm::en_prec)
|
||||
<< std::setw(cvm::en_width) << force_k << "\n";
|
||||
}
|
||||
return colvarbias_restraint_moving::get_state_params() + os.str();
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
||||
@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons
|
||||
std::string const colvarbias_restraint_harmonic::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_centers_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf)
|
||||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i
|
||||
std::string const colvarbias_restraint_harmonic_walls::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
|
||||
@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con
|
||||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
}
|
||||
@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const
|
||||
std::string const colvarbias_restraint_linear::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_centers_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf)
|
||||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
|
||||
@ -74,9 +74,6 @@ protected:
|
||||
|
||||
/// \brief Restraint centers
|
||||
std::vector<colvarvalue> colvar_centers;
|
||||
|
||||
/// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied)
|
||||
std::vector<colvarvalue> colvar_centers_raw;
|
||||
};
|
||||
|
||||
|
||||
@ -156,10 +153,16 @@ protected:
|
||||
/// \brief New restraint centers
|
||||
std::vector<colvarvalue> target_centers;
|
||||
|
||||
/// \brief Initial value of the restraint centers
|
||||
std::vector<colvarvalue> initial_centers;
|
||||
|
||||
/// \brief Amplitude of the restraint centers' increment at each step
|
||||
/// (or stage) towards the new values (calculated from target_nsteps)
|
||||
/// towards the new values (calculated from target_nsteps)
|
||||
std::vector<colvarvalue> centers_incr;
|
||||
|
||||
/// \brief Update the centers by interpolating between initial and target
|
||||
virtual int update_centers(cvm::real lambda);
|
||||
|
||||
/// Whether to write the current restraint centers to the trajectory file
|
||||
bool b_output_centers;
|
||||
|
||||
|
||||
@ -132,9 +132,15 @@ public:
|
||||
static std::vector<feature *> cvc_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvar
|
||||
virtual std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cvc_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cvc_features;
|
||||
}
|
||||
|
||||
|
||||
/// \brief Obtain data needed for the calculation for the backend
|
||||
virtual void read_data();
|
||||
|
||||
@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) {
|
||||
}
|
||||
|
||||
void colvardeps::init_feature(int feature_id, const char *description, feature_type type) {
|
||||
features()[feature_id]->description = description;
|
||||
features()[feature_id]->type = type;
|
||||
modify_features()[feature_id]->description = description;
|
||||
modify_features()[feature_id]->type = type;
|
||||
}
|
||||
|
||||
// Shorthand macros for describing dependencies
|
||||
@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() {
|
||||
int i;
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_cvb_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cvb_active, "active", f_type_dynamic);
|
||||
@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() {
|
||||
size_t i;
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_cv_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cv_active, "active", f_type_dynamic);
|
||||
@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() {
|
||||
// Initialize static array once and for all
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < colvardeps::f_cvc_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cvc_active, "active", f_type_dynamic);
|
||||
@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() {
|
||||
// Initialize static array once and for all
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_ag_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_ag_active, "active", f_type_dynamic);
|
||||
|
||||
@ -135,7 +135,8 @@ public:
|
||||
// with a non-static array
|
||||
// Intermediate classes (colvarbias and colvarcomp, which are also base classes)
|
||||
// implement this as virtual to allow overriding
|
||||
virtual std::vector<feature *>&features() = 0;
|
||||
virtual const std::vector<feature *>&features() = 0;
|
||||
virtual std::vector<feature *>&modify_features() = 0;
|
||||
|
||||
void add_child(colvardeps *child);
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
#define COLVARS_VERSION "2017-07-15"
|
||||
#ifndef COLVARS_VERSION
|
||||
#define COLVARS_VERSION "2017-08-06"
|
||||
// This file is part of the Collective Variables module (Colvars).
|
||||
// The original version of Colvars and its updates are located at:
|
||||
// https://github.com/colvars/colvars
|
||||
@ -6,3 +7,4 @@
|
||||
// If you wish to distribute your changes, please submit them to the
|
||||
// Colvars repository at GitHub.
|
||||
|
||||
#endif
|
||||
|
||||
@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj,
|
||||
}
|
||||
|
||||
if ((subcmd == "get") || (subcmd == "set")) {
|
||||
std::vector<colvardeps::feature *> &features = obj->features();
|
||||
std::vector<colvardeps::feature *> const &features = obj->features();
|
||||
std::string const req_feature(obj_to_str(objv[3]));
|
||||
colvardeps::feature *f = NULL;
|
||||
int fid = 0;
|
||||
|
||||
@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false;
|
||||
cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02;
|
||||
|
||||
|
||||
/// Numerical Recipes Jacobi diagonalization (file-local helper).
/// Returns COLVARS_OK when the off-diagonal sum reaches zero, or
/// COLVARS_ERROR when the iteration limit is exhausted; the caller is
/// responsible for reporting the failure.
|
||||
static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
|
||||
|
||||
/// Eigenvector sort (file-local helper); returns COLVARS_OK
|
||||
static int eigsrt(cvm::real *d, cvm::real **v);
|
||||
|
||||
/// Transpose the matrix in place (file-local helper); returns COLVARS_OK
|
||||
static int transpose(cvm::real **v);
|
||||
|
||||
|
||||
|
||||
std::string cvm::rvector::to_simple_string() const
|
||||
{
|
||||
std::ostringstream os;
|
||||
@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d<cvm::real> &S,
|
||||
|
||||
// diagonalize
|
||||
int jac_nrot = 0;
|
||||
jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot);
|
||||
if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) !=
|
||||
COLVARS_OK) {
|
||||
cvm::error("Too many iterations in routine jacobi.\n"
|
||||
"This is usually the result of an ill-defined set of atoms for "
|
||||
"rotational alignment (RMSD, rotateReference, etc).\n");
|
||||
}
|
||||
eigsrt(S_eigval.c_array(), S_eigvec.c_array());
|
||||
// jacobi saves eigenvectors by columns
|
||||
transpose(S_eigvec.c_array());
|
||||
@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector<cvm::atom_pos> co
|
||||
|
||||
#define n 4
|
||||
|
||||
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
{
|
||||
int j,iq,ip,i;
|
||||
cvm::real tresh,theta,tau,t,sm,s,h,g,c;
|
||||
@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
sm += std::fabs(a[ip][iq]);
|
||||
}
|
||||
if (sm == 0.0) {
|
||||
return;
|
||||
return COLVARS_OK;
|
||||
}
|
||||
if (i < 4)
|
||||
tresh=0.2*sm/(n*n);
|
||||
@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
z[ip]=0.0;
|
||||
}
|
||||
}
|
||||
cvm::error("Too many iterations in routine jacobi.\n");
|
||||
return COLVARS_ERROR;
|
||||
}
|
||||
|
||||
void eigsrt(cvm::real *d, cvm::real **v)
|
||||
|
||||
int eigsrt(cvm::real *d, cvm::real **v)
|
||||
{
|
||||
int k,j,i;
|
||||
cvm::real p;
|
||||
@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v)
|
||||
}
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
void transpose(cvm::real **v)
|
||||
|
||||
int transpose(cvm::real **v)
|
||||
{
|
||||
cvm::real p;
|
||||
int i,j;
|
||||
@ -641,6 +660,7 @@ void transpose(cvm::real **v)
|
||||
v[j][i]=p;
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
#undef n
|
||||
|
||||
@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m,
|
||||
}
|
||||
|
||||
|
||||
/// Numerical recipes diagonalization
|
||||
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
|
||||
|
||||
/// Eigenvector sort
|
||||
void eigsrt(cvm::real *d, cvm::real **v);
|
||||
|
||||
/// Transpose the matrix
|
||||
void transpose(cvm::real **v);
|
||||
|
||||
|
||||
|
||||
|
||||
/// \brief 1-dimensional vector of real numbers with four components and
|
||||
|
||||
@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const
|
||||
}
|
||||
|
||||
|
||||
/// Return the midpoint between x1 and x2, optionally weighted by lambda
|
||||
/// (which must be between 0.0 and 1.0)
|
||||
colvarvalue const colvarvalue::interpolate(colvarvalue const &x1,
|
||||
colvarvalue const &x2,
|
||||
cvm::real const lambda)
|
||||
{
|
||||
colvarvalue::check_types(x1, x2);
|
||||
|
||||
if ((lambda < 0.0) || (lambda > 1.0)) {
|
||||
cvm::error("Error: trying to interpolate between two colvarvalues with a "
|
||||
"lamdba outside [0:1].\n", BUG_ERROR);
|
||||
}
|
||||
|
||||
colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2);
|
||||
cvm::real const d2 = x1.dist2(x2);
|
||||
|
||||
switch (x1.type()) {
|
||||
case colvarvalue::type_scalar:
|
||||
case colvarvalue::type_3vector:
|
||||
case colvarvalue::type_vector:
|
||||
case colvarvalue::type_unit3vectorderiv:
|
||||
case colvarvalue::type_quaternionderiv:
|
||||
return interp;
|
||||
break;
|
||||
case colvarvalue::type_unit3vector:
|
||||
case colvarvalue::type_quaternion:
|
||||
if (interp.norm()/std::sqrt(d2) < 1.0e-6) {
|
||||
cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+
|
||||
cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+
|
||||
" is undefined: result = "+cvm::to_str(interp)+"\n",
|
||||
INPUT_ERROR);
|
||||
}
|
||||
interp.apply_constraints();
|
||||
return interp;
|
||||
break;
|
||||
case colvarvalue::type_notset:
|
||||
default:
|
||||
x1.undef_op();
|
||||
break;
|
||||
}
|
||||
return colvarvalue(colvarvalue::type_notset);
|
||||
}
|
||||
|
||||
|
||||
std::string colvarvalue::to_simple_string() const
|
||||
{
|
||||
switch (type()) {
|
||||
|
||||
@ -193,6 +193,12 @@ public:
|
||||
/// Derivative with respect to this \link colvarvalue \endlink of the square distance
|
||||
colvarvalue dist2_grad(colvarvalue const &x2) const;
|
||||
|
||||
/// Return the midpoint between x1 and x2, optionally weighted by lambda
|
||||
/// (which must be between 0.0 and 1.0)
|
||||
static colvarvalue const interpolate(colvarvalue const &x1,
|
||||
colvarvalue const &x2,
|
||||
cvm::real const lambda = 0.5);
|
||||
|
||||
/// Assignment operator (type of x is checked)
|
||||
colvarvalue & operator = (colvarvalue const &x);
|
||||
|
||||
@ -285,10 +291,10 @@ public:
|
||||
cvm::real & operator [] (int const i);
|
||||
|
||||
/// Ensure that the two types are the same within a binary operator
|
||||
int static check_types(colvarvalue const &x1, colvarvalue const &x2);
|
||||
static int check_types(colvarvalue const &x1, colvarvalue const &x2);
|
||||
|
||||
/// Ensure that the two types are the same within an assignment, or that the left side is type_notset
|
||||
int static check_types_assign(Type const &vt1, Type const &vt2);
|
||||
static int check_types_assign(Type const &vt1, Type const &vt2);
|
||||
|
||||
/// Undefined operation
|
||||
void undef_op() const;
|
||||
@ -317,14 +323,14 @@ public:
|
||||
|
||||
/// \brief Optimized routine for the inner product of one collective
|
||||
/// variable with an array
|
||||
void static inner_opt(colvarvalue const &x,
|
||||
static void inner_opt(colvarvalue const &x,
|
||||
std::vector<colvarvalue>::iterator &xv,
|
||||
std::vector<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
||||
/// \brief Optimized routine for the inner product of one collective
|
||||
/// variable with an array
|
||||
void static inner_opt(colvarvalue const &x,
|
||||
static void inner_opt(colvarvalue const &x,
|
||||
std::list<colvarvalue>::iterator &xv,
|
||||
std::list<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
@ -332,14 +338,14 @@ public:
|
||||
/// \brief Optimized routine for the second order Legendre
|
||||
/// polynomial, (3cos^2(w)-1)/2, of one collective variable with an
|
||||
/// array
|
||||
void static p2leg_opt(colvarvalue const &x,
|
||||
static void p2leg_opt(colvarvalue const &x,
|
||||
std::vector<colvarvalue>::iterator &xv,
|
||||
std::vector<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
||||
/// \brief Optimized routine for the second order Legendre
|
||||
/// polynomial of one collective variable with an array
|
||||
void static p2leg_opt(colvarvalue const &x,
|
||||
static void p2leg_opt(colvarvalue const &x,
|
||||
std::list<colvarvalue>::iterator &xv,
|
||||
std::list<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
||||
@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -
|
||||
|
||||
specify one or more options, order does not matter
|
||||
|
||||
copies an existing Makefile.machine in lib/gpu to Makefile.auto
|
||||
copies an existing Makefile.machine in lib/gpu to Makefile.auto
|
||||
optionally edits these variables in Makefile.auto:
|
||||
CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
|
||||
optionally uses Makefile.auto to build the GPU library -> libgpu.a
|
||||
@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix
|
||||
-h = set CUDA_HOME variable in Makefile.auto to hdir
|
||||
hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
|
||||
-a = set CUDA_ARCH variable in Makefile.auto to arch
|
||||
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
|
||||
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
|
||||
or GeForce GTX 580 or similar
|
||||
use arch = 30 for Tesla K10 (Kepler)
|
||||
use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
|
||||
@ -108,10 +108,10 @@ if pflag:
|
||||
elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE"
|
||||
elif precision == "single": precstr = "-D_SINGLE_SINGLE"
|
||||
else: error("Invalid precision setting")
|
||||
|
||||
|
||||
# create Makefile.auto
|
||||
# reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested
|
||||
|
||||
|
||||
if not os.path.exists("Makefile.%s" % isuffix):
|
||||
error("lib/gpu/Makefile.%s does not exist" % isuffix)
|
||||
|
||||
|
||||
@ -22,21 +22,21 @@
|
||||
offset=tid & (t_per_atom-1); \
|
||||
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
|
||||
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, nbor_end, nbor_begin) \
|
||||
i=nbor_mem[ii]; \
|
||||
nbor_begin=ii+nbor_stride; \
|
||||
numj=nbor_mem[nbor_begin]; \
|
||||
if (nbor_mem==packed_mem) { \
|
||||
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
|
||||
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
|
||||
i, numj, n_stride, nbor_end, nbor_begin) \
|
||||
i=dev_nbor[ii]; \
|
||||
nbor_begin=ii+nbor_pitch; \
|
||||
numj=dev_nbor[nbor_begin]; \
|
||||
if (dev_nbor==dev_packed) { \
|
||||
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
|
||||
n_stride=fast_mul(t_per_atom,nbor_pitch); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
|
||||
nbor_begin+=offset; \
|
||||
} else { \
|
||||
nbor_begin+=nbor_stride; \
|
||||
nbor_begin=nbor_mem[nbor_begin]; \
|
||||
nbor_begin+=nbor_pitch; \
|
||||
nbor_begin=dev_nbor[nbor_begin]; \
|
||||
nbor_end=nbor_begin+numj; \
|
||||
stride=t_per_atom; \
|
||||
n_stride=t_per_atom; \
|
||||
nbor_begin+=offset; \
|
||||
}
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
|
||||
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_two, const char *k_three_center,
|
||||
const char *k_three_end) {
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char *short_nbor) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
} else // neigh yes or tpa == 1
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
||||
return -10;
|
||||
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
|
||||
_block_pair=device->pair_block_size();
|
||||
_block_size=device->block_ellipse();
|
||||
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
|
||||
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
_max_an_bytes+=ans2->gpu_bytes();
|
||||
#endif
|
||||
|
||||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
|
||||
k_three_end.clear();
|
||||
k_three_end_vatom.clear();
|
||||
k_pair.clear();
|
||||
k_short_nbor.clear();
|
||||
delete pair_program;
|
||||
_compiled=false;
|
||||
}
|
||||
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
dev_short_nbor.clear();
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
#ifdef THREE_CONCURRENT
|
||||
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
||||
if (!success)
|
||||
return NULL;
|
||||
|
||||
_nall = nall;
|
||||
|
||||
// originally the requirement that nall == nlist was enforced
|
||||
// to allow direct indexing neighbors of neighbors after re-arrangement
|
||||
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
||||
return 0;
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
_nall = nall;
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
dev_short_nbor.resize((2+_max_nbors)*_nmax);
|
||||
}
|
||||
|
||||
// _ainum to be used in loop() for short neighbor list build
|
||||
_ainum = nlist;
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
dev_short_nbor.resize((2+_max_nbors)*_nmax);
|
||||
}
|
||||
|
||||
// _ainum to be used in loop() for short neighbor list build
|
||||
_ainum = nall;
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *ktwo, const char *kthree_center,
|
||||
const char *kthree_end) {
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
std::string vatom_name=std::string(kthree_end)+"_vatom";
|
||||
std::string vatom_name=std::string(three_end)+"_vatom";
|
||||
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
k_three_center.set_function(*pair_program,kthree_center);
|
||||
k_three_end.set_function(*pair_program,kthree_end);
|
||||
k_three_center.set_function(*pair_program,three_center);
|
||||
k_three_end.set_function(*pair_program,three_end);
|
||||
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
|
||||
k_pair.set_function(*pair_program,ktwo);
|
||||
k_pair.set_function(*pair_program,two);
|
||||
k_short_nbor.set_function(*pair_program,short_nbor);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#ifdef THREE_CONCURRENT
|
||||
|
||||
@ -56,7 +56,8 @@ class BaseThree {
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_two,
|
||||
const char *k_three_center, const char *k_three_end);
|
||||
const char *k_three_center, const char *k_three_end,
|
||||
const char *k_short_nbor=NULL);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
@ -73,18 +74,18 @@ class BaseThree {
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to copied to host
|
||||
* \param current maximum number of neighbors
|
||||
/** \param inum number of particles whose nbors must be stored on device
|
||||
* \param max_nbors maximum number of neighbors
|
||||
* \param success set to false if insufficient memory
|
||||
* \note olist_size=total number of local particles **/
|
||||
inline void resize_local(const int inum, const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,max_nbors,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
/** \param inum number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to be copied to host
|
||||
* \param current maximum number of neighbors
|
||||
* \param max_nbors current maximum number of neighbors
|
||||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
@ -143,14 +144,6 @@ class BaseThree {
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int * compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
@ -193,6 +186,9 @@ class BaseThree {
|
||||
/// Neighbor data
|
||||
Neighbor *nbor;
|
||||
|
||||
UCL_D_Vec<int> dev_short_nbor;
|
||||
UCL_Kernel k_short_nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
|
||||
@ -207,12 +203,13 @@ class BaseThree {
|
||||
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
|
||||
int _gpu_nbor;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
int _max_nbors, _ainum, _nall;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *k_two, const char *k_three_center,
|
||||
const char *k_three_end);
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag,
|
||||
const int evatom) = 0;
|
||||
|
||||
@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
|
||||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,sw,"k_sw","k_sw_three_center",
|
||||
"k_sw_three_end");
|
||||
"k_sw_three_end","k_sw_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
||||
@ -130,6 +130,63 @@ texture<int4> sw3_tex;
|
||||
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
// Build a distance-filtered ("short") neighbor list for the SW kernels.
//
// For the slice of atom i's neighbors that atom_info/nbor_info assign to this
// thread, walk dev_packed from nbor up to nbor_end in steps of n_stride and
// copy every entry whose squared distance to atom i is below sw3[ijparam].y
// into dev_short_nbor.
//
// Layout written (m is the first index handed back by nbor_info):
//   dev_short_nbor[m]                 number of entries kept by this thread
//   dev_short_nbor[m + k*n_stride]    k-th kept entry, k = 1..count
// Entries are stored as read from dev_packed (before the NEIGHMASK masking).
//
// Parameters:
//   x_             not indexed directly here; positions are read through
//                  fetch4(...,pos_tex). The inline "//x_[i]" notes suggest the
//                  two are equivalent -- TODO confirm.
//   sw3            per-parameter-set values; only .y (squared cutoff) is read
//   map            remaps the type held in the w component of a position; the
//                  result is used to index elem2param
//   elem2param     flattened [nelements][nelements][nelements] table
//   nelements      extent of each elem2param dimension
//   dev_nbor,
//   dev_packed     forwarded to nbor_info; neighbor entries are read from
//                  dev_packed
//   dev_short_nbor output, written as described above
//   inum           threads whose ii is >= inum do nothing
//   nbor_pitch,
//   t_per_atom     forwarded to atom_info / nbor_info
//
// NOTE(review): a kept entry is written one stride after the position it was
// read from, so when nothing is filtered out the last write lands one
// n_stride past the last index read from dev_packed. Presumably
// dev_short_nbor is allocated with room for that extra slot -- confirm
// against the host-side allocation.
// ---------------------------------------------------------------------------
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sw3,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch, const int t_per_atom) {
|
||||
// n_stride is filled in by nbor_info below
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
// sets ii, tid and offset for this thread
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
// sets i, numj, n_stride, nbor_end and nbor (numj is not used afterwards)
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
// remember the first slot: it receives the count, not a neighbor entry
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
// kept entries start one stride after the count slot
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
// keep the raw entry for storage; the masked copy is used as the atom index
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
// parameter set for the i-j pair: the third index repeats jtype
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sw1,
|
||||
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem = dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j)
|
||||
nbor_k+=n_stride;
|
||||
int nbor_k,k_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
nbor_k=nborj_start-offset_j+offset_k;
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
} else {
|
||||
nbor_k = nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||
k_end = nbor_end;
|
||||
}
|
||||
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (dev_packed==dev_nbor && k <= j) continue;
|
||||
|
||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||
int ktype=kx.w;
|
||||
ktype=map[ktype];
|
||||
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
||||
@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
||||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff,"k_tersoff_repulsive",
|
||||
"k_tersoff_three_center", "k_tersoff_three_end");
|
||||
"k_tersoff_three_center", "k_tersoff_three_end",
|
||||
"k_tersoff_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
||||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
|
||||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int ** TersoffT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
@ -106,7 +106,7 @@ texture<int4> ts5_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
|
||||
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
// Build a distance-filtered ("short") neighbor list for the Tersoff kernels.
//
// For the slice of atom i's neighbors that atom_info/nbor_info assign to this
// thread, walk dev_packed from nbor up to nbor_end in steps of n_stride and
// copy every entry whose squared distance to atom i is below cutsq[ijparam]
// into dev_short_nbor.
//
// Layout written (m is the first index handed back by nbor_info):
//   dev_short_nbor[m]                 number of entries kept by this thread
//   dev_short_nbor[m + k*n_stride]    k-th kept entry, k = 1..count
// Entries are stored as read from dev_packed (before the NEIGHMASK masking).
//
// Parameters:
//   x_             not indexed directly here; positions are read through
//                  fetch4(...,pos_tex). The inline "//x_[i]" notes suggest the
//                  two are equivalent -- TODO confirm.
//   cutsq          squared cutoff per parameter set, indexed by ijparam
//   map            remaps the type held in the w component of a position; the
//                  result is used to index elem2param
//   elem2param     flattened [nelements][nelements][nelements] table
//   nelements      extent of each elem2param dimension
//   nparams        not referenced in this kernel body
//   dev_nbor,
//   dev_packed     forwarded to nbor_info; neighbor entries are read from
//                  dev_packed
//   dev_short_nbor output, written as described above
//   inum           threads whose ii is >= inum do nothing
//   nbor_pitch,
//   t_per_atom     forwarded to atom_info / nbor_info
//
// NOTE(review): a kept entry is written one stride after the position it was
// read from, so when nothing is filtered out the last write lands one
// n_stride past the last index read from dev_packed. Presumably
// dev_short_nbor is allocated with room for that extra slot -- confirm
// against the host-side allocation.
// ---------------------------------------------------------------------------
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
// n_stride is filled in by nbor_info below
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
// sets ii, tid and offset for this thread
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
// sets i, numj, n_stride, nbor_end and nbor (numj is not used afterwards)
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
// remember the first slot: it receives the count, not a neighbor entry
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
// kept entries start one stride after the count slot
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
// keep the raw entry for storage; the masked copy is used as the atom index
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
// parameter set for the i-j pair: the third index repeats jtype
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// keep only neighbors inside this pair's squared cutoff
if (rsq<cutsq[ijparam]) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
// if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
|
||||
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
|
||||
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
|
||||
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
|
||||
@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
|
||||
const double* h, const double* gamma, const double* beta,
|
||||
const double* powern, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
|
||||
|
||||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
||||
@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
|
||||
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
|
||||
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
|
||||
"k_tersoff_mod_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
|
||||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int ** TersoffMT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
@ -106,7 +106,7 @@ texture<int4> ts5_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
|
||||
|
||||
#endif
|
||||
|
||||
// Build the short neighbor list in dev_short_nbor.
//
// For every work item with ii<inum, the neighbors read from dev_packed are
// filtered by the pair cutoff cutsq[ijparam]; the survivors are written to
// dev_short_nbor using the same n_stride spacing as the source list.
// Layout per thread: the slot at the initial `nbor` index holds the number
// of kept neighbors, and the kept entries follow starting one n_stride later.
// Entries are stored exactly as read from dev_packed, i.e. before the
// NEIGHMASK is applied.
// NOTE(review): i, numj, n_stride, nbor_end and nbor are filled in by the
// nbor_info macro, which is defined outside this view - confirm its exact
// semantics there.
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
                                       const __global numtyp *restrict cutsq,
                                       const __global int *restrict map,
                                       const __global int *restrict elem2param,
                                       const int nelements, const int nparams,
                                       const __global int * dev_nbor,
                                       const __global int * dev_packed,
                                       __global int * dev_short_nbor,
                                       const int inum, const int nbor_pitch,
                                       const int t_per_atom) {
  __local int n_stride;
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
    int i, numj, nbor, nbor_end;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;
    itype=map[itype];

    // Slot that receives the neighbor count; zeroed up front and written
    // again with the final count once the scan is done.
    int count_slot = nbor;
    dev_short_nbor[count_slot] = 0;

    // Next free position in the short list and number of entries kept.
    int write_pos = nbor+n_stride;
    int kept = 0;

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      // Keep the unmasked value for storage; mask only for the lookup.
      int packed_j=dev_packed[nbor];
      int j=packed_j & NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];

      // Squared distance between i and j
      numtyp dx = ix.x-jx.x;
      numtyp dy = ix.y-jx.y;
      numtyp dz = ix.z-jx.z;
      numtyp rsq = dx*dx+dy*dy+dz*dz;

      // Skip anything not strictly inside the cutoff
      if (!(rsq<cutsq[ijparam])) continue;

      dev_short_nbor[write_pos] = packed_j;
      write_pos += n_stride;
      kept++;
    } // for nbor

    // store the number of neighbors for each thread
    dev_short_nbor[count_slot] = kept;

  } // if ii
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (numtyp)0;
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
|
||||
@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
|
||||
const double* h, const double* beta, const double* powern,
|
||||
const double* powern_del, const double* ca1, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
|
||||
|
||||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
||||
@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
||||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
|
||||
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
|
||||
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
|
||||
"k_tersoff_zbl_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
||||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
|
||||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int ** TersoffZT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
@ -109,7 +109,7 @@ texture<int4> ts6_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
|
||||
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
numtyp4 ts6_ijparam = ts6[ijparam];
|
||||
numtyp ijparam_Z_i = ts6_ijparam.x;
|
||||
numtyp ijparam_Z_j = ts6_ijparam.y;
|
||||
numtyp ijparam_ZBLcut = ts6_ijparam.z;
|
||||
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
numtyp4 ts6_ijparam = ts6[ijparam];
|
||||
numtyp ijparam_Z_i = ts6_ijparam.x;
|
||||
numtyp ijparam_Z_j = ts6_ijparam.y;
|
||||
numtyp ijparam_ZBLcut = ts6_ijparam.z;
|
||||
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
|
||||
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
|
||||
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
|
||||
@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
|
||||
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
|
||||
const double global_a_0, const double global_epsilon_0, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
|
||||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _global_e,_global_a_0,_global_epsilon_0;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
||||
@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
||||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
|
||||
"k_vashishta_three_end");
|
||||
"k_vashishta_three_end","k_vashishta_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
||||
|
||||
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
double r0sqmax = 0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
|
||||
|
||||
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
|
||||
if (r0sqmax < r0sq) r0sqmax = r0sq;
|
||||
dview[i].x=static_cast<numtyp>(r0sq);
|
||||
dview[i].y=static_cast<numtyp>(gamma[i]);
|
||||
dview[i].z=static_cast<numtyp>(cutsq[i]);
|
||||
dview[i].w=static_cast<numtyp>(r0[i]);
|
||||
}
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(r0sqmax);
|
||||
|
||||
ucl_copy(param4,dview,false);
|
||||
param4_tex.get_texture(*(this->pair_program),"param4_tex");
|
||||
param4_tex.bind_float(param4,4);
|
||||
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &param4, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// note that k_pair does not run with the short neighbor list
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
Answer<numtyp,acctyp> *end_ans;
|
||||
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
end_ans=this->ans;
|
||||
#endif
|
||||
if (evatom!=0) {
|
||||
|
||||
this->k_three_end_vatom.set_size(GX,BX);
|
||||
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
} else {
|
||||
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
@ -136,6 +136,64 @@ texture<int4> param5_tex;
|
||||
|
||||
#endif
|
||||
|
||||
// Build the short neighbor list read by the three-body Vashishta kernels.
// For the atom i this thread works on, walk its neighbors in dev_packed and
// copy into dev_short_nbor only those j with rsq < param4[ijparam].x (noted
// below as r0sq). The slot where the walk starts receives the number of
// neighbors kept; the kept entries follow it, n_stride apart.
// NOTE(review): nparams is not referenced in this body, and numj is filled by
// nbor_info but not read afterwards. x_ appears only in the trailing comments
// next to fetch4 - presumably fetch4 reads it in some build modes; confirm.
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict param4,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
// atom_info and nbor_info are defined outside this view; presumably they
// fill ii/tid/offset and i/numj/n_stride/nbor_end/nbor respectively - confirm.
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
// m remembers the first slot of this thread's walk; it ends up holding the count
int m = nbor;
|
||||
// provisional count, overwritten with ncount after the loop
dev_short_nbor[m] = 0;
|
||||
// write cursor: kept neighbors start one stride after the count slot
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
// keep the unmasked value; that is what gets stored in the short list
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict param1,
|
||||
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
numtyp param3_dvrc=param3_ijparam.z;
|
||||
numtyp param3_c0 =param3_ijparam.w;
|
||||
|
||||
numtyp r=sqrt(rsq);
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
numtyp rinvsq=1.0/rsq;
|
||||
numtyp r4inv = rinvsq*rinvsq;
|
||||
numtyp r6inv = rinvsq*r4inv;
|
||||
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
numtyp reta = pow(r,-param1_eta);
|
||||
numtyp lam1r = r*param1_lam1inv;
|
||||
numtyp lam4r = r*param1_lam4inv;
|
||||
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
|
||||
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
|
||||
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
|
||||
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
|
||||
|
||||
numtyp force = (param2_dvrc*r
|
||||
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
|
||||
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
|
||||
|
||||
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||
\
|
||||
numtyp r2 = ucl_sqrt(rsq2); \
|
||||
numtyp rinvsq2 = ucl_recip(rsq2); \
|
||||
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
|
||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||
\
|
||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcssq = delcs*delcs; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinvsq = pcsinv*pcsinv; \
|
||||
numtyp pcs = delcssq/pcsinv; \
|
||||
\
|
||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||
\
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp frad1 = facrad*gsrainvsq1; \
|
||||
numtyp frad2 = facrad*gsrainvsq2; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang12 = rinv12*facang; \
|
||||
numtyp csfacang = cs*facang; \
|
||||
numtyp csfac1 = rinvsq1*csfacang; \
|
||||
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||
\
|
||||
numtyp r2 = ucl_sqrt(rsq2); \
|
||||
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||
\
|
||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcssq = delcs*delcs; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinvsq = pcsinv*pcsinv; \
|
||||
numtyp pcs = delcssq/pcsinv; \
|
||||
\
|
||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||
\
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp frad1 = facrad*gsrainvsq1; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang12 = rinv12*facang; \
|
||||
numtyp csfacang = cs*facang; \
|
||||
numtyp csfac1 = rinvsq1*csfacang; \
|
||||
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij=param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij=param4_ijparam.w;
|
||||
|
||||
int nbor_k=nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j)
|
||||
nbor_k+=n_stride;
|
||||
int nbor_k,k_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
nbor_k=nborj_start-offset_j+offset_k;
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
} else {
|
||||
nbor_k = nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||
k_end = nbor_end;
|
||||
}
|
||||
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (dev_packed==dev_nbor && k <= j) continue;
|
||||
|
||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||
int ktype=kx.w;
|
||||
ktype=map[ktype];
|
||||
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij = param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij = param4_ijparam.w;
|
||||
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij=param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij=param4_ijparam.w;
|
||||
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for use with the short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
||||
@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
|
||||
UCL_D_Vec<int> elem2param;
|
||||
UCL_D_Vec<int> map;
|
||||
int _nparams,_nelements;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;
|
||||
|
||||
|
||||
@ -6,6 +6,8 @@
|
||||
from __future__ import print_function
|
||||
import sys,os,re,subprocess
|
||||
|
||||
# help message
|
||||
|
||||
help = """
|
||||
Syntax from src dir: make lib-kim args="-b -v version -a kim-name"
|
||||
or: make lib-kim args="-b -a everything"
|
||||
@ -23,7 +25,7 @@ specify one or more options, order does not matter
|
||||
-b = download and build base KIM API library with example Models
|
||||
this will delete any previous installation in the current folder
|
||||
-n = do NOT download and build base KIM API library.
|
||||
Use an existing installation
|
||||
Use an existing installation
|
||||
-p = specify location of KIM API installation (implies -n)
|
||||
-a = add single KIM model or model driver with kim-name
|
||||
to existing KIM API lib (see example below).
|
||||
@ -78,13 +80,27 @@ def which(program):
|
||||
return None
|
||||
|
||||
def geturl(url,fname):
|
||||
success = False
|
||||
|
||||
if which('curl') != None:
|
||||
cmd = 'curl -L -o "%s" %s' % (fname,url)
|
||||
elif which('wget') != None:
|
||||
try:
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
success = True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Calling curl failed with: %s" % e.output.decode('UTF-8'))
|
||||
|
||||
if not success and which('wget') != None:
|
||||
cmd = 'wget -O "%s" %s' % (fname,url)
|
||||
else: error("cannot find 'wget' or 'curl' to download source code")
|
||||
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
return txt
|
||||
try:
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
success = True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Calling wget failed with: %s" % e.output.decode('UTF-8'))
|
||||
|
||||
if not success:
|
||||
error("Failed to download source code with 'curl' or 'wget'")
|
||||
return
|
||||
|
||||
# parse args
|
||||
|
||||
|
||||
@ -1,5 +1,46 @@
|
||||
# Change Log
|
||||
|
||||
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
|
||||
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
|
||||
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
|
||||
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
|
||||
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
|
||||
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
|
||||
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
|
||||
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
|
||||
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
|
||||
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
|
||||
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
|
||||
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
|
||||
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
|
||||
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
|
||||
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
|
||||
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
|
||||
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
|
||||
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
|
||||
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
|
||||
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
|
||||
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
|
||||
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
|
||||
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
|
||||
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
|
||||
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
|
||||
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
|
||||
- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
|
||||
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
|
||||
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
|
||||
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
|
||||
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
|
||||
|
||||
|
||||
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)
|
||||
|
||||
@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
|
||||
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
|
||||
|
||||
# Check for advanced settings.
|
||||
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
|
||||
@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2
|
||||
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
ifneq ($(OMPI_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
ifneq ($(MPICH_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG = 1
|
||||
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Set compiler warnings flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
# TODO check if PGI accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# TODO check if cray accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
#gcc
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
endif
|
||||
|
||||
# Set OpenMP flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
@ -162,6 +193,7 @@ endif
|
||||
|
||||
# Intel based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
|
||||
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
|
||||
|
||||
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
|
||||
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
|
||||
@ -257,12 +290,10 @@ endif
|
||||
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
|
||||
|
||||
# No warnings:
|
||||
KOKKOS_CXXFLAGS =
|
||||
# INTEL and CLANG warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
# GCC warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
|
||||
endif
|
||||
|
||||
KOKKOS_LIBS = -lkokkos -ldl
|
||||
KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xSSE4.2
|
||||
KOKKOS_LDFLAGS += -xSSE4.2
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=nehalem
|
||||
KOKKOS_LDFLAGS += -tp=nehalem
|
||||
else
|
||||
# Assume that this really is a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -msse4.2
|
||||
KOKKOS_LDFLAGS += -msse4.2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
|
||||
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
|
||||
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
|
||||
else
|
||||
|
||||
@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
|
||||
@ -61,14 +61,19 @@ protected:
|
||||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = omp_get_max_threads();
|
||||
int threads_count = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp atomic
|
||||
++threads_count;
|
||||
}
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
if (threads_count > 3) {
|
||||
threads_count /= 2;
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -35,7 +35,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
@ -283,12 +283,12 @@ struct test_random_scalar {
|
||||
RandomGenerator& pool,
|
||||
unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using Kokkos::parallel_reduce;
|
||||
|
||||
{
|
||||
cerr << " -- Testing randomness properties" << endl;
|
||||
cout << " -- Testing randomness properties" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
|
||||
@ -307,7 +307,7 @@ struct test_random_scalar {
|
||||
( 1.5*tolerance > variance_eps)) ? 1:0;
|
||||
pass_covar = ((-2.0*tolerance < covariance_eps) &&
|
||||
( 2.0*tolerance > covariance_eps)) ? 1:0;
|
||||
cerr << "Pass: " << pass_mean
|
||||
cout << "Pass: " << pass_mean
|
||||
<< " " << pass_var
|
||||
<< " " << mean_eps
|
||||
<< " " << variance_eps
|
||||
@ -315,7 +315,7 @@ struct test_random_scalar {
|
||||
<< " || " << tolerance << endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 1-D histogram" << endl;
|
||||
cout << " -- Testing 1-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
@ -335,7 +335,7 @@ struct test_random_scalar {
|
||||
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
|
||||
( 0.06 > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 1D: " << mean_eps
|
||||
cout << "Density 1D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
|
||||
<< " || " << tolerance
|
||||
@ -348,7 +348,7 @@ struct test_random_scalar {
|
||||
<< endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 3-D histogram" << endl;
|
||||
cout << " -- Testing 3-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
@ -368,7 +368,7 @@ struct test_random_scalar {
|
||||
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
|
||||
( tolerance > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 3D: " << mean_eps
|
||||
cout << "Density 3D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
|
||||
<< " || " << tolerance
|
||||
@ -381,18 +381,18 @@ struct test_random_scalar {
|
||||
template <class RandomGenerator>
|
||||
void test_random(unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
|
||||
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
|
||||
|
||||
|
||||
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
cerr << "Test Seed:" << ticks << endl;
|
||||
cout << "Test Seed:" << ticks << endl;
|
||||
|
||||
RandomGenerator pool(ticks);
|
||||
|
||||
cerr << "Test Scalar=int" << endl;
|
||||
cout << "Test Scalar=int" << endl;
|
||||
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int.pass_mean,1);
|
||||
ASSERT_EQ( test_int.pass_var,1);
|
||||
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
|
||||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=unsigned int" << endl;
|
||||
cout << "Test Scalar=unsigned int" << endl;
|
||||
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint.pass_mean,1);
|
||||
ASSERT_EQ( test_uint.pass_var,1);
|
||||
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
|
||||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=int64_t" << endl;
|
||||
cout << "Test Scalar=int64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int64.pass_mean,1);
|
||||
ASSERT_EQ( test_int64.pass_var,1);
|
||||
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
|
||||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=uint64_t" << endl;
|
||||
cout << "Test Scalar=uint64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint64.pass_mean,1);
|
||||
ASSERT_EQ( test_uint64.pass_var,1);
|
||||
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
|
||||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=float" << endl;
|
||||
cout << "Test Scalar=float" << endl;
|
||||
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_float.pass_mean,1);
|
||||
ASSERT_EQ( test_float.pass_var,1);
|
||||
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
|
||||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=double" << endl;
|
||||
cout << "Test Scalar=double" << endl;
|
||||
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_double.pass_mean,1);
|
||||
ASSERT_EQ( test_double.pass_var,1);
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
@ -44,12 +44,13 @@
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<bench.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize();
|
||||
|
||||
|
||||
if(argc<10) {
|
||||
|
||||
if(argc<10) {
|
||||
printf("Arguments: N K R D U F T S\n");
|
||||
printf(" P: Precision (1==float, 2==double)\n");
|
||||
printf(" N,K: dimensions of the 2D array to allocate\n");
|
||||
@ -68,7 +69,7 @@ int main(int argc, char* argv[]) {
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int P = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
@ -80,7 +81,7 @@ int main(int argc, char* argv[]) {
|
||||
int T = atoi(argv[8]);
|
||||
int S = atoi(argv[9]);
|
||||
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
|
||||
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}
|
||||
|
||||
|
||||
@ -44,11 +44,11 @@
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<gather.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
|
||||
if(argc<8) {
|
||||
printf("Arguments: S N K D\n");
|
||||
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");
|
||||
|
||||
44
lib/kokkos/benchmarks/policy_performance/Makefile
Normal file
44
lib/kokkos/benchmarks/policy_performance/Makefile
Normal file
@ -0,0 +1,44 @@
|
||||
KOKKOS_PATH = ../..
|
||||
SRC = $(wildcard *.cpp)
|
||||
|
||||
default: build
|
||||
echo "Start Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
|
||||
CXXFLAGS = -O3 -g
|
||||
LINK = ${CXX}
|
||||
LINKFLAGS =
|
||||
EXE = policy_performance.cuda
|
||||
KOKKOS_DEVICES = "Cuda,OpenMP"
|
||||
KOKKOS_ARCH = "SNB,Kepler35"
|
||||
KOKKOS_CUDA_OPTIONS+=enable_lambda
|
||||
else
|
||||
CXX = g++
|
||||
CXXFLAGS = -O3 -g -Wall -Werror
|
||||
LINK = ${CXX}
|
||||
LINKFLAGS =
|
||||
EXE = policy_performance.host
|
||||
KOKKOS_DEVICES = "OpenMP"
|
||||
KOKKOS_ARCH = "SNB"
|
||||
endif
|
||||
|
||||
DEPFLAGS = -M
|
||||
|
||||
OBJ = $(SRC:.cpp=.o)
|
||||
LIB =
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
build: $(EXE)
|
||||
|
||||
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o *.cuda *.host
|
||||
|
||||
# Compilation rules
|
||||
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
170
lib/kokkos/benchmarks/policy_performance/main.cpp
Normal file
170
lib/kokkos/benchmarks/policy_performance/main.cpp
Normal file
@ -0,0 +1,170 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include "policy_perf_test.hpp"
|
||||
|
||||
int main(int argc, char* argv[] ) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
if(argc<10) {
|
||||
printf(" Ten arguments are needed to run this program:\n");
|
||||
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
|
||||
printf(" team_range: number of teams (league_size)\n");
|
||||
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
|
||||
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
|
||||
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
|
||||
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
|
||||
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
|
||||
printf(" team_size: number of team members (team_size)\n");
|
||||
printf(" vector_size: desired vectorization (if possible)\n");
|
||||
printf(" schedule: 1 == Static 2 == Dynamic\n");
|
||||
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
|
||||
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
|
||||
printf(" TeamPolicy:\n");
|
||||
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" RangePolicy:\n");
|
||||
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
|
||||
printf(" Y: 0 = none\n");
|
||||
printf(" Z: 0 = none\n");
|
||||
printf(" Example Input:\n");
|
||||
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int team_range = atoi(argv[1]);
|
||||
int thread_range = atoi(argv[2]);
|
||||
int vector_range = atoi(argv[3]);
|
||||
|
||||
int outer_repeat = atoi(argv[4]);
|
||||
int thread_repeat = atoi(argv[5]);
|
||||
int vector_repeat = atoi(argv[6]);
|
||||
|
||||
int team_size = atoi(argv[7]);
|
||||
int vector_size = atoi(argv[8]);
|
||||
int schedule = atoi(argv[9]);
|
||||
int test_type = atoi(argv[10]);
|
||||
|
||||
int disable_verbose_output = 0;
|
||||
if ( argc > 11 ) {
|
||||
disable_verbose_output = atoi(argv[11]);
|
||||
}
|
||||
|
||||
if ( schedule != 1 && schedule != 2 ) {
|
||||
printf("schedule: %d\n", schedule);
|
||||
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
|
||||
Kokkos::finalize();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
|
||||
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
|
||||
&& test_type != 300 && test_type != 400 && test_type != 500
|
||||
)
|
||||
{
|
||||
printf("Incorrect test_type option\n");
|
||||
Kokkos::finalize();
|
||||
return -2;
|
||||
}
|
||||
|
||||
double result = 0.0;
|
||||
|
||||
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
|
||||
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
|
||||
lval += 1;
|
||||
}, result);
|
||||
|
||||
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
|
||||
|
||||
// Allocate view without initializing
|
||||
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
|
||||
// Second call to test is the one we actually care about and time
|
||||
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
|
||||
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
|
||||
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
|
||||
|
||||
double result_computed = 0.0;
|
||||
double result_expect = 0.0;
|
||||
double time = 0.0;
|
||||
|
||||
if(schedule==1) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
if(schedule==2) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
|
||||
if ( disable_verbose_output == 0 ) {
|
||||
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
printf("%lf\n",time);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
354
lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
Normal file
354
lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
Normal file
@ -0,0 +1,354 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
template < class ViewType >
|
||||
struct ParallelScanFunctor {
|
||||
using value_type = double;
|
||||
ViewType v;
|
||||
|
||||
ParallelScanFunctor( const ViewType & v_ )
|
||||
: v(v_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int idx, value_type& val, const bool& final ) const
|
||||
{
|
||||
// inclusive scan
|
||||
val += v(idx);
|
||||
if ( final ) {
|
||||
v(idx) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
|
||||
void test_policy(int team_range, int thread_range, int vector_range,
|
||||
int outer_repeat, int thread_repeat, int inner_repeat,
|
||||
int team_size, int vector_size, int test_type,
|
||||
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
|
||||
double &result, double &result_expect, double &time) {
|
||||
|
||||
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
|
||||
typedef typename t_policy::member_type t_team;
|
||||
Kokkos::Timer timer;
|
||||
|
||||
for(int orep = 0; orep<outer_repeat; orep++) {
|
||||
|
||||
if (test_type == 100) {
|
||||
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
v1(idx) = idx;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
|
||||
if (test_type == 110) {
|
||||
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2( idx, t ) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 111) {
|
||||
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 112) {
|
||||
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
}
|
||||
v2( idx, t ) = vector_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 120) {
|
||||
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 121) {
|
||||
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
}, team_result);
|
||||
}
|
||||
v3( idx, 0, 0 ) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 122) {
|
||||
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
lval += vector_result;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 200) {
|
||||
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
lval+=team.team_size()*team.league_rank() + team.team_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
|
||||
// sum ( seq( [0, team_range*team_size) )
|
||||
}
|
||||
if (test_type == 210) {
|
||||
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 211) {
|
||||
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
lval+=idx+thread_for;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 212) {
|
||||
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double vector_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
|
||||
vval += vi;
|
||||
}, vector_result );
|
||||
}
|
||||
});
|
||||
}
|
||||
lval+= idx + vector_result;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 220) {
|
||||
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
tval += t;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank(); // constant * league_rank
|
||||
},result);
|
||||
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
|
||||
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 221) {
|
||||
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_for = 1.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
tval += t + vector_for;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 222) {
|
||||
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
|
||||
vval += vi;
|
||||
}, vector_result);
|
||||
}
|
||||
tval += t + vector_result;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
|
||||
// parallel_for RangePolicy: range = team_size*team_range
|
||||
if (test_type == 300) {
|
||||
Kokkos::parallel_for("300 outer for", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx) {
|
||||
v1(idx) = idx;
|
||||
// prevent compiler from optimizing away the loop
|
||||
});
|
||||
}
|
||||
// parallel_reduce RangePolicy: range = team_size*team_range
|
||||
if (test_type == 400) {
|
||||
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx, double& val) {
|
||||
val += idx;
|
||||
}, result);
|
||||
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
// parallel_scan RangePolicy: range = team_size*team_range
|
||||
if (test_type == 500) {
|
||||
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
|
||||
ParallelScanFunctor<ViewType1>(v1)
|
||||
#if 0
|
||||
// This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation
|
||||
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
|
||||
// inclusive scan
|
||||
val += v1(idx);
|
||||
if ( final ) {
|
||||
v1(idx) = val;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
);
|
||||
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
|
||||
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
|
||||
} // end outer for loop
|
||||
|
||||
time = timer.seconds();
|
||||
} //end test_policy
|
||||
53
lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh
Executable file
53
lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh
Executable file
@ -0,0 +1,53 @@
|
||||
#!/bin/bash

# Smoke test for the policy_perf_test code: runs the benchmark once for every
# supported test code under both schedules, for each executable flavour
# (host / cuda) that is present in the current directory.

echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"

EXECUTABLE=policy_performance

# Problem sizes and repeat counts handed to every run
TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1

# Every test code the benchmark accepts
ALL_CODES="100 110 111 112 120 121 122 200 210 211 212 220 221 222 300 400 500"

# Host executable: thread binding is requested through OMP_PROC_BIND
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]
then
  for SCHEDULE in 1 2
  do
    case $SCHEDULE in
      1) echo "Host tests Static schedule" ;;
      2) echo "Host tests Dynamic schedule" ;;
    esac
    for CODE in $ALL_CODES
    do
      OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
    done
  done
fi

# Cuda executable
SUFFIX=cuda
if [ -e $EXECUTABLE.$SUFFIX ]
then
  for SCHEDULE in 1 2
  do
    case $SCHEDULE in
      1) echo "Cuda tests Static schedule" ;;
      2) echo "Cuda tests Dynamic schedule" ;;
    esac
    for CODE in $ALL_CODES
    do
      ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
    done
  done
fi
|
||||
126
lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh
Executable file
126
lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh
Executable file
@ -0,0 +1,126 @@
|
||||
#!/bin/bash

# Sample script for benchmarking policy performance

# Suggested environment variables to export prior to executing script:
# KNL:
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
# Power:
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true

# Constants and Variables:
# Vary: TEAMSIZE, and THREADRANGE
#   for TEAMSIZE in {1,2,4,5,8}; do
#   for THREADRANGE in {32,41,1000}; do
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
# System specific: Adjust REPEAT values to architecture tests are run on

# Tests
# Static SCHEDULE = 1
#   Tier 1: parallel_for + RangePolicy 300
#   Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
#   Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
#   Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Dynamic SCHEDULE = 2
#   Tier 5: parallel_for + RangePolicy 300
#   Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
#   Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
#   Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY

# Results grouped by:
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE

EXECUTABLE=policy_performance

# Default defined values
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1

# Host tests
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]; then
  echo "Host"

  for SCHEDULE in {1,2}; do

    # Reset defaults: the TEAMSIZE / THREADRANGE loop variables of the
    # previous pass would otherwise leak into the RangePolicy tiers of the
    # next schedule (same reset as in the Cuda section below).
    TEAMRANGE=1000
    THREADRANGE=1
    VECTORRANGE=32
    TEAMSIZE=1
    VECTORSIZE=1

    # Tier 1 and 2, 5 and 6
    for CODE in {300,400,500}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
      done
    done

    # Tier 3, 7
    for CODE in {100,110,111,112,120,121,122}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

    # Tier 4, 8
    for CODE in {200,210,211,212,220,221,222}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

  done # end SCHEDULE

fi # end host


# Cuda tests
SUFFIX=cuda
# TEAMRANGE=10000, TEAMSIZE=8 too large
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
if [ -e $EXECUTABLE.$SUFFIX ]; then
  echo "Cuda"

  for SCHEDULE in {1,2}; do

    # Reset defaults
    TEAMRANGE=1000
    THREADRANGE=1
    VECTORRANGE=32
    TEAMSIZE=1
    VECTORSIZE=1

    # Tier 1 and 2, 5 and 6
    for CODE in {300,400,500}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
      done
    done

    # Tier 3, 7
    for CODE in {100,110,111,112,120,121,122}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

    # Tier 4, 8
    for CODE in {200,210,211,212,220,221,222}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

  done # end SCHEDULE

fi #end cuda
|
||||
454
lib/kokkos/bin/hpcbind
Executable file
454
lib/kokkos/bin/hpcbind
Executable file
@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
################################################################################
|
||||
# Check if hwloc commands exist
|
||||
################################################################################
|
||||
# hwloc counts as available only if every hwloc command used by this script
# can be resolved.
declare -i HPCBIND_HAS_HWLOC=1
for hwloc_tool in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  if ! type "${hwloc_tool}" >/dev/null 2>&1; then
    HPCBIND_HAS_HWLOC=0
  fi
done
unset hwloc_tool

if (( HPCBIND_HAS_HWLOC == 0 )); then
  echo "hwloc not found, no process binding will occur"
fi

# Get parent cpuset
HPCBIND_HWLOC_PARENT_CPUSET=""
if (( HPCBIND_HAS_HWLOC == 1 )); then
  MY_PID="$BASHPID"
  # second tab-separated field of the hwloc-ps line(s) matching this PID
  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
fi

################################################################################
# Check if nvidia-smi exists
################################################################################
declare -i HPCBIND_HAS_NVIDIA=0
if type nvidia-smi >/dev/null 2>&1; then
  HPCBIND_HAS_NVIDIA=1
fi


################################################################################
# Get visible gpu
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if (( HPCBIND_HAS_NVIDIA == 1 )); then
  # one line of 'nvidia-smi -L' output is counted as one GPU
  NUM_GPUS=$(nvidia-smi -L | wc -l);
  GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
  # an already exported CUDA_VISIBLE_DEVICES takes precedence over the full list
  HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
fi

# GPU mapping is enabled as soon as at least one GPU was counted
declare -i HPCBIND_ENABLE_GPU_MAPPING=0
if (( NUM_GPUS > 0 )); then
  HPCBIND_ENABLE_GPU_MAPPING=1
fi
|
||||
|
||||
|
||||
################################################################################
# Get queue id
# supports sbatch, bsub, aprun
################################################################################
# The first scheduler variable found selects the queue name and index and
# turns queue-aware GPU mapping on.
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0

if [[ -n "${SLURM_LOCAL_ID}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="sbatch"
  HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ -n "${LSB_JOBINDEX:-${LBS_JOBINDEX}}" ]]; then
  # LSF exports LSB_JOBINDEX; the previously tested spelling LBS_JOBINDEX is
  # still honoured as a fallback so existing setups keep working.
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="bsub"
  HPCBIND_QUEUE_INDEX=${LSB_JOBINDEX:-${LBS_JOBINDEX}}
elif [[ -n "${ALPS_APP_PE}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="aprun"
  HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
|
||||
|
||||
|
||||
################################################################################
# Show help
################################################################################
# Prints the usage text to stdout; the script's own basename is substituted
# into the usage line and the sample invocations.
function show_help {
  local cmd
  cmd=$(basename "$0")
  cat <<EOF
Usage: ${cmd} <options> -- command ...
 Set the process mask, OMP environment variables and CUDA environment
 variables to sane values if possible. Uses hwloc and nvidia-smi if
 available. Will preserve the current process binding, so it is safe
 to use with a queuing system or mpiexec.

Options:
 --no-hwloc-bind Disable binding
 --proc-bind=<LOC> Set the initial process mask for the script
 LOC can be any valid location argument for
 hwloc-calc Default: all
 --distribute=N Distribute the current cpuset into N partitions
 --distribute-partition=I
 Use the i'th partition (zero based)
 --visible-gpus=<L> Comma separated list of gpu ids
 Default: CUDA_VISIBLE_DEVICES or all gpus in
 sequential order
 --gpu-ignore-queue Ignore queue job id when choosing visible GPU
 --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES
 --openmp=M.m Set env variables for the given OpenMP version
 Default: 4.0
 --openmp-percent=N Integer percentage of cpuset to use for OpenMP
 threads Default: 100
 --openmp-places=<Op> Op=threads|cores|sockets. Default: threads
 --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES
 --force-openmp-num-threads=N
 Override logic for selecting OMP_NUM_THREADS
 --force-openmp-proc-bind=<OP>
 Override logic for selecting OMP_PROC_BIND
 --no-openmp-nested Set OMP_NESTED to false
 --show-bindings Show the bindings
 --lstopo Show bindings in lstopo without executing a command
 -v|--verbose Show options and relevant environment variables
 -h|--help Show this message

Sample Usage:
 Split the current process cpuset into 4 and use the 3rd partition
 ${cmd} --distribute=4 --distribute-partition=2 -v -- command ...
 Bind the process to all even cores
 ${cmd} --proc-bind=core:even -v -- command ...
 Bind to the first 64 cores and split the current process cpuset into 4
 ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ...
 skip GPU 0 when mapping visible devices
 ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ...
 Display the current bindings
 ${cmd} --proc-bind=numa:0 --show-bindings -- command
 Display the current bindings using lstopo
 ${cmd} --proc-bind=numa:0.core:odd --lstopo

EOF
}
|
||||
|
||||
|
||||
################################################################################
# Parse command line arguments
################################################################################
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi

# Option defaults; each may be overridden by the matching flag below.
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0

declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0

# Consume options one positional parameter at a time. Reading "$1" (quoted)
# keeps arguments containing whitespace or glob characters intact and keeps
# the shift count in step with the arguments actually examined. Whatever
# follows "--" is left in "$@" as the command to run.
while [[ $# -gt 0 ]]; do
  arg="$1"
  case "${arg}" in
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
      ;;
    --proc-bind=*)
      HPCBIND_PROC_BIND="${arg#*=}"
      ;;
    # number of partitions to create
    --distribute=*)
      HPCBIND_DISTRIBUTE="${arg#*=}"
      ;;
    # which partition to use
    --distribute-partition=*)
      HPCBIND_PARTITION="${arg#*=}"
      ;;
    --visible-gpus=*)
      HPCBIND_VISIBLE_GPUS=$(echo "${arg#*=}" | tr ',' ' ')
      ;;
    --gpu-ignore-queue)
      HPCBIND_QUEUE_GPU_MAPPING=0
      ;;
    --no-gpu-mapping)
      HPCBIND_ENABLE_GPU_MAPPING=0
      ;;
    --openmp=*)
      HPCBIND_OPENMP_VERSION="${arg#*=}"
      ;;
    --openmp-percent=*)
      HPCBIND_OPENMP_PERCENT="${arg#*=}"
      ;;
    --openmp-places=*)
      HPCBIND_OPENMP_PLACES="${arg#*=}"
      ;;
    --no-openmp-proc-bind)
      HPCBIND_OPENMP_PROC_BIND=0
      ;;
    --force-openmp-proc-bind=*)
      HPCBIND_OPENMP_FORCE_PROC_BIND="${arg#*=}"
      ;;
    --force-openmp-num-threads=*)
      HPCBIND_OPENMP_FORCE_NUM_THREADS="${arg#*=}"
      ;;
    --no-openmp-nested)
      HPCBIND_OPENMP_NESTED="false"
      ;;
    --show-bindings)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=1
      ;;
    --lstopo)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=0
      HPCBIND_LSTOPO=1
      ;;
    -v|--verbose)
      HPCBIND_VERBOSE=1
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("${arg}")
      ;;
  esac
  shift
done
unset arg
|
||||
|
||||
|
||||
################################################################################
# Check unknown arguments
################################################################################
# Abort when any option was not recognised. -gt performs the intended
# numeric comparison ('>' inside [[ ]] compares strings).
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
|
||||
################################################################################
# Check that visible gpus are valid
################################################################################
# Turn the whitespace separated list into an array, replace any id outside
# [0, NUM_GPUS) with 0, then redefine NUM_GPUS as the length of the list.
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
      HPCBIND_VISIBLE_GPUS[$i]=0;
    fi
  done
  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
  # An empty list (e.g. "--visible-gpus=") would make the later
  # "% NUM_GPUS" a division by zero, so GPU mapping is switched off instead.
  if [[ ${NUM_GPUS} -eq 0 ]]; then
    echo "No visible GPUs given, disabling GPU mapping"
    HPCBIND_ENABLE_GPU_MAPPING=0
  fi
fi
|
||||
|
||||
|
||||
################################################################################
# Check OpenMP percent
################################################################################
# Clamp the percentage to the range [1, 100].
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
  echo "OpenMP percent < 1, setting to 1"
  HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
  echo "OpenMP percent > 100, setting to 100"
  HPCBIND_OPENMP_PERCENT=100
fi

################################################################################
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  HPCBIND_DISTRIBUTE=1
fi

# The partition is used as a zero based array index, so it must lie in
# [0, HPCBIND_DISTRIBUTE); negative values are rejected as well.
if [[ ${HPCBIND_PARTITION} -lt 0 || ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
  echo "Invalid input for distribute-partition, changing to 0"
  HPCBIND_PARTITION=0
fi
|
||||
|
||||
|
||||
################################################################################
# Find cpuset and num threads
################################################################################
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0

if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -ne 1 ]]; then
  # Without hwloc binding, count the "processor" lines of /proc/cpuinfo.
  HPCBIND_NUM_PUS=$(grep -c processor /proc/cpuinfo)
else
  # Resolve the requested location, restricted to the parent cpuset if known.
  if [[ -n "${HPCBIND_HWLOC_PARENT_CPUSET}" ]]; then
    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
  else
    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
  fi

  # Split the binding into the requested number of partitions, pick ours and
  # count the processing units it contains.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
  HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
fi

# Thread count is the requested percentage of the PUs (integer arithmetic),
# kept within [1, HPCBIND_NUM_PUS].
declare -i HPCBIND_OPENMP_NUM_THREADS=$(( (HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT) / 100 ))

if (( HPCBIND_OPENMP_NUM_THREADS < 1 )); then
  HPCBIND_OPENMP_NUM_THREADS=1
elif (( HPCBIND_OPENMP_NUM_THREADS > HPCBIND_NUM_PUS )); then
  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
fi

# An explicit positive thread count given on the command line wins.
if (( HPCBIND_OPENMP_FORCE_NUM_THREADS > 0 )); then
  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
fi
|
||||
|
||||
################################################################################
# Set OpenMP environment variables
################################################################################

# Thread count
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}

# OMP_PROC_BIND and OMP_PLACES
if [[ ${HPCBIND_OPENMP_PROC_BIND} -ne 1 ]]; then
  # Proc bind disabled: drop both variables from the environment.
  unset OMP_PLACES
  unset OMP_PROC_BIND
elif [[ -n "${HPCBIND_OPENMP_FORCE_PROC_BIND}" ]]; then
  # Caller supplied the OMP_PROC_BIND value explicitly.
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
elif [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
  # Default for version strings comparing equal to or after "4.0".
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  export OMP_PROC_BIND="spread"
else
  # Default for earlier version strings: no OMP_PLACES.
  export OMP_PROC_BIND="true"
  unset OMP_PLACES
fi

# Nested parallelism
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
|
||||
|
||||
|
||||
################################################################################
# Set CUDA environment variables
################################################################################

# Choose one entry of the visible GPU list, round-robin over either the
# partition index alone or the queue-aware task id.
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  declare -i GPU_ID
  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -ne 0 ]]; then
    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    GPU_ID=$((MY_TASK_ID % NUM_GPUS))
  else
    GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  fi
  export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
|
||||
|
||||
################################################################################
# Set hpcbind environment variables
################################################################################
# Publish the settings computed above to the environment of the command.
export HPCBIND_HAS_HWLOC HPCBIND_HAS_NVIDIA HPCBIND_NUM_PUS HPCBIND_HWLOC_CPUSET
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}

# An empty parent cpuset is reported as "all".
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET:-all}"

export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}

# Join the GPU id array with commas.
HPCBIND_NVIDIA_VISIBLE_GPUS="${HPCBIND_VISIBLE_GPUS[*]}"
export HPCBIND_NVIDIA_VISIBLE_GPUS="${HPCBIND_NVIDIA_VISIBLE_GPUS// /,}"

export HPCBIND_OPENMP_VERSION

# Queue details are exported only when a queue was detected.
if [[ -n "${HPCBIND_QUEUE_NAME}" ]]; then
  export HPCBIND_QUEUE_INDEX HPCBIND_QUEUE_NAME HPCBIND_QUEUE_GPU_MAPPING
fi
|
||||
|
||||
|
||||
################################################################################
# Print verbose
################################################################################

# Dump the sorted environment in three sections selected by name prefix.
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
  MY_ENV=$(env | sort)
  echo "[HPCBIND]"
  grep -E "^HPCBIND_" <<< "${MY_ENV}"
  echo "[CUDA]"
  grep -E "^CUDA_" <<< "${MY_ENV}"
  echo "[OPENMP]"
  grep -E "^OMP_" <<< "${MY_ENV}"
fi

# List the processing units of the selected cpuset when requested.
if [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    echo "[BINDINGS]"
    hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
  else
    echo "Unable to show bindings, hwloc not available."
  fi
fi
|
||||
|
||||
################################################################################
# Run command
################################################################################

# "$@" holds whatever followed "--" on the command line. It is passed on
# quoted so that arguments containing whitespace or glob characters reach
# the command unchanged instead of being split and expanded a second time.
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- "$@"
  else
    eval "$@"
  fi
else
  # --lstopo: display the bindings instead of running a command.
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
      # DISPLAY is set, so lstopo is started bound to the selected cpuset.
      echo "[BINDINGS]"
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
    else
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
    fi
  else
    echo "Unable to show bindings, hwloc not available."
  fi
fi
|
||||
221
lib/kokkos/bin/kokkos-bind
Executable file
221
lib/kokkos/bin/kokkos-bind
Executable file
@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# check if hwloc commands exist
# HAS_HWLOC counts the hwloc tools that could NOT be found, so 0 means that
# every required tool is available.
declare -i HAS_HWLOC=0
for hwloc_tool in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  type "${hwloc_tool}" >/dev/null 2>&1 || HAS_HWLOC=$((HAS_HWLOC + 1))
done
unset hwloc_tool

# option defaults
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False

#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
  MY_PID="$BASHPID"
  # Compare the PID field of the tab-separated hwloc-ps listing exactly; a
  # plain grep for the digits would also match longer PIDs or other columns.
  CURRENT_CPUSET=$(hwloc-ps --cpuset | awk -F'\t' -v pid="${MY_PID}" '$1 == pid { print $2; exit }')
  # NOTE(review): unconditional print, possibly leftover debugging output;
  # kept because callers may already see it on stdout. Confirm before removing.
  echo "$CURRENT_CPUSET"
fi
|
||||
|
||||
# Prints the usage text to stdout; the script's own basename is substituted
# into the usage line and the sample invocation.
function show_help {
  local cmd
  cmd=$(basename "$0")
  printf '%s\n' \
    "Usage: ${cmd} <options> -- command ..." \
    " Uses hwloc to divide the node into the given number of groups," \
    " set the appropriate OMP_NUM_THREADS and execute the command on the" \
    " selected group." \
    "" \
    " NOTE: This command assumes it has exclusive use of the node" \
    "" \
    "Options:" \
    " --proc-bind=<LOC> Set the initial process mask for the script. " \
    " LOC can be any valid location argument for" \
    " hwloc-calc. Defaults to the entire machine" \
    " --distribute=N Distribute the current proc-bind into N groups" \
    " --index=I Use the i'th group (zero based)" \
    " --openmp=M.m Set env variables for the given OpenMP version" \
    " (default 4.0)" \
    " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" \
    " --no-openmp-nested Set OMP_NESTED to false" \
    " -v|--verbose" \
    " -h|--help" \
    "" \
    "Sample Usage:" \
    " ${cmd} --distribute=4 --index=2 -v -- command ..." \
    ""
}
|
||||
|
||||
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi

# Consume options one positional parameter at a time. Reading "$1" (quoted)
# keeps arguments containing whitespace or glob characters intact and keeps
# the shift count in step with the arguments actually examined. Whatever
# follows "--" is left in "$@" as the command to run.
while [[ $# -gt 0 ]]; do
  arg="$1"
  case "${arg}" in
    --proc-bind=*)
      PROC_BIND="${arg#*=}"
      ;;
    # number of partitions to create
    --distribute=*)
      DISTRIBUTE="${arg#*=}"
      ;;
    # which group to use
    --index=*)
      INDEX="${arg#*=}"
      ;;
    --openmp=*)
      OPENMP_VERSION="${arg#*=}"
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      ;;
    -v|--verbose)
      VERBOSE=True
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("${arg}")
      ;;
  esac
  shift
done
unset arg
|
||||
|
||||
# Abort when any option was not recognised. -gt performs the intended
# numeric comparison ('>' inside [[ ]] compares strings).
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi

if [[ ${DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  DISTRIBUTE=1
fi

# INDEX is used as a zero based array index, so it must lie in
# [0, DISTRIBUTE); negative values are rejected as well.
if [[ ${INDEX} -lt 0 || ${INDEX} -ge ${DISTRIBUTE} ]]; then
  echo "Invalid input for index, changing index to 0"
  INDEX=0
fi

# HAS_HWLOC is non-zero when at least one hwloc tool is missing.
if [[ ${HAS_HWLOC} -ne 0 ]]; then
  echo "hwloc not found, no process binding will occur"
  DISTRIBUTE=1
  INDEX=0
fi
|
||||
|
||||
# Work out the thread count (and, with hwloc, the cpuset to bind to).
if [[ ${HAS_HWLOC} -eq 0 ]]; then

  # Resolve the requested location, restricted to the current cpuset if known.
  if [[ "${CURRENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
  fi

  # Split the binding into DISTRIBUTE groups, pick ours and count its PUs.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
  CPUSET=${CPUSETS[${INDEX}]}
  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: true"
    echo " proc_bind: ${PROC_BIND}"
    echo " distribute: ${DISTRIBUTE}"
    echo " index: ${INDEX}"
    echo " parent_cpuset: ${CURRENT_CPUSET}"
    echo " cpuset: ${CPUSET}"
  fi
else
  # Without hwloc, count the "processor" lines of /proc/cpuinfo.
  NUM_THREADS=$(grep -c processor /proc/cpuinfo)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: false"
  fi
fi

if [[ "${VERBOSE}" == "True" ]]; then
  echo "omp_num_threads: ${NUM_THREADS}"
  echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
  echo "omp_nested: ${OPENMP_NESTED}"
  echo "OpenMP: ${OPENMP_VERSION}"
fi

# set OMP env (identical for both the hwloc and the non-hwloc case)
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
  if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
    export OMP_PLACES="threads"
    export OMP_PROC_BIND="spread"
  else
    export OMP_PROC_BIND="true"
    unset OMP_PLACES
  fi
else
  unset OMP_PLACES
  unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
  export OMP_NESTED="true"
else
  export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"

# Run the command that followed "--". "$@" is quoted so that arguments
# containing whitespace or glob characters are not split or expanded again.
if [[ ${HAS_HWLOC} -eq 0 ]]; then
  hwloc-bind ${CPUSET} -- "$@"
else
  eval "$@"
fi
|
||||
|
||||
165
lib/kokkos/bin/runtest
Executable file
165
lib/kokkos/bin/runtest
Executable file
@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Prints the physical path of the parent of the directory containing this
# script. Returns non-zero without printing anything if either directory
# change fails, instead of reporting an unrelated working directory.
function get_path() {
  cd "$(dirname "$0")" || return 1
  cd .. || return 1
  pwd -P
}
|
||||
|
||||
KOKKOS_PATH="$(get_path "$0")"
|
||||
|
||||
# Prints this script's usage text followed by the help output of
# generate_makefile.bash. Always returns 0.
function show_help() {
  local cmd
  cmd=$(basename "$0")
  printf '%s\n' \
    "Usage: ${cmd} <options> " \
    " Build and run the tests" \
    "" \
    "Options:" \
    " -j=N|--make-j=N Build the tests in parallel" \
    " -c|--clean Clean build and regenerate make files" \
    " --clean-on-pass Clean build when runtest passes" \
    " --output-prefix=<pre> Prefix of log files Default: runtest" \
    " --build-only Only build the tests" \
    " -v|--verbose Tee STDOUT and STDERR to screen and files" \
    " -h|--help Show this message" \
    ""
  # Quoted like the other invocations of this script further down, so a
  # checkout path containing whitespace still works.
  "${KOKKOS_PATH}"/generate_makefile.bash --help
  return 0
}
|
||||
|
||||
|
||||
# Option defaults. Arguments not recognised here are collected in
# GENERATE_ARGS and handed to generate_makefile.bash.
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"

# Parallel build width defaults to HPCBIND_NUM_PUS when set, otherwise 1.
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}

# Consume options one positional parameter at a time. Reading "$1" (quoted)
# keeps arguments containing whitespace or glob characters intact, which
# matters for pass-through arguments.
while [[ $# -gt 0 ]]; do
  arg="$1"
  case "${arg}" in
    -j=*|--make-j=*)
      MAKE_J=${arg#*=}
      ;;
    -c|--clean)
      CLEAN=1
      ;;
    --clean-on-pass)
      CLEAN_ON_PASS=1
      ;;
    --output-prefix=*)
      OUTPUT=${arg#*=}
      ;;
    --build-only)
      BUILD_ONLY=1
      ;;
    -v|--verbose)
      VERBOSE=1
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      GENERATE_ARGS+=("${arg}")
      ;;
  esac
  shift
done
unset arg
|
||||
|
||||
# Refuse to run from the repository root. The right-hand side is quoted so
# that it is compared literally; unquoted it would be treated as a glob
# pattern by [[ == ]].
if [[ "$(pwd -P)" == "${KOKKOS_PATH}" ]]; then
  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
  exit 1
fi
|
||||
|
||||
# Makefile dependencies are not fully reliable, so a clean removes the
# generated tree outright; generate_makefile.bash then runs from scratch.
if (( CLEAN == 1 )); then
  START=${SECONDS}
  echo "Cleaning"
  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
  END=${SECONDS}
  echo " $((END-START)) seconds"
  # Verbose mode separates the stages with two blank lines.
  if (( VERBOSE == 1 )); then
    printf '\n\n'
  fi
fi
|
||||
|
||||
# Three stages follow: generate the Makefile, build the tests, run the tests.
# Each stage sends stdout to ${OUTPUT}.out and stderr to ${OUTPUT}.err
# (stderr is also shown on screen; stdout only in verbose mode), reports
# PASS/FAIL with the elapsed seconds, and exits with 1 on failure.
# Log file names are quoted so an output prefix containing whitespace works.

# ---- Stage 1: generate ----
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"

if [[ ${VERBOSE} -eq 0 ]]; then
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > "${OUTPUT}.out" 2> >(tee "${OUTPUT}.err" >&2)
else
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee "${OUTPUT}.out") 2> >(tee "${OUTPUT}.err" >&2)
fi
# Must directly follow the if/fi above so $? is the generator's exit status.
declare -i RESULT=$?
declare -i END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
  echo " PASS: $((END-START)) seconds"
  if [[ ${VERBOSE} -eq 1 ]]; then
    echo ""
    echo ""
  fi
else
  grep "FAIL" "${OUTPUT}.out"
  grep "FAIL" "${OUTPUT}.err"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

# ---- Stage 2: build ----
START=${SECONDS}
echo "Building"
if [[ ${VERBOSE} -eq 0 ]]; then
  make --keep-going -j ${MAKE_J} build-test >> "${OUTPUT}.out" 2> >(tee -a "${OUTPUT}.err" >&2)
else
  make --keep-going -j ${MAKE_J} build-test > >(tee -a "${OUTPUT}.out") 2> >(tee -a "${OUTPUT}.err" >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
  echo " PASS: $((END-START)) seconds"
  if [[ ${VERBOSE} -eq 1 ]]; then
    echo ""
    echo ""
  fi
else
  # Show only the compiler error lines from the logs.
  grep -E "[[:space:]]error:[[:space:]]" "${OUTPUT}.out"
  grep -E "[[:space:]]error:[[:space:]]" "${OUTPUT}.err"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

# ---- Stage 3: test (skipped with --build-only) ----
if [[ ${BUILD_ONLY} -eq 0 ]]; then
  START=${SECONDS}
  echo "Testing"
  if [[ ${VERBOSE} -eq 0 ]]; then
    make --keep-going test >> "${OUTPUT}.out" 2> >(tee -a "${OUTPUT}.err" >&2)
  else
    make --keep-going test > >(tee -a "${OUTPUT}.out") 2> >(tee -a "${OUTPUT}.err" >&2)
  fi
  RESULT=$?
  END=${SECONDS}
  if [[ ${RESULT} -eq 0 ]]; then
    echo " PASS: $((END-START)) seconds"
    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
      make clean
    fi
  else
    grep "FAIL" "${OUTPUT}.out"
    grep "FAIL" "${OUTPUT}.err"
    echo " FAIL: $((END-START)) seconds"
    exit 1
  fi
fi

exit ${RESULT}
|
||||
|
||||
@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
|
||||
${Kokkos_SOURCE_DIR}/containers/src
|
||||
${Kokkos_SOURCE_DIR}/algorithms/src
|
||||
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
|
||||
${KOKKOS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
# pass include dirs back to parent scope
|
||||
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
|
||||
|
||||
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
|
||||
|
||||
IF(KOKKOS_SEPARATE_LIBS)
|
||||
|
||||
@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
|
||||
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
|
||||
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
|
||||
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
|
||||
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
|
||||
|
||||
24
lib/kokkos/config/query_cuda_arch.cpp
Normal file
24
lib/kokkos/config/query_cuda_arch.cpp
Normal file
@ -0,0 +1,24 @@
|
||||
#include <cstdio>
|
||||
#include <cuda_runtime_api.h>
|
||||
int main()
|
||||
{
|
||||
cudaDeviceProp prop;
|
||||
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
|
||||
if (cudaSuccess != err_code) {
|
||||
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
|
||||
return -1;
|
||||
}
|
||||
switch (prop.major) {
|
||||
case 3:
|
||||
printf("Kepler"); break;
|
||||
case 5:
|
||||
printf("Maxwell"); break;
|
||||
case 6:
|
||||
printf("Pascal"); break;
|
||||
default:
|
||||
fprintf(stderr, "Unspported Device %d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return -1;
|
||||
}
|
||||
printf("%d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return 0;
|
||||
}
|
||||
@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
|
||||
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
@ -584,7 +589,7 @@ single_build_and_test() {
|
||||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
local -i build_start_time=$(date +%s)
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
comment="build_time=$(($build_end_time-$build_start_time))"
|
||||
|
||||
|
||||
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
|
||||
export JENKINS_DO_SERIAL=OFF
|
||||
export JENKINS_DO_COMPLEX=OFF
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
||||
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
|
||||
export JENKINS_DO_SERIAL=ON
|
||||
export JENKINS_DO_COMPLEX=ON
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
||||
@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
|
||||
test-openmp: KokkosContainers_PerformanceTest_OpenMP
|
||||
./KokkosContainers_PerformanceTest_OpenMP
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,12 +36,15 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
||||
@ -69,30 +69,13 @@ protected:
|
||||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned num_threads = 4;
|
||||
|
||||
if (Kokkos::hwloc::available()) {
|
||||
num_threads = Kokkos::hwloc::get_available_numa_count()
|
||||
* Kokkos::hwloc::get_available_cores_per_numa()
|
||||
* Kokkos::hwloc::get_available_threads_per_core()
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
std::cout << "OpenMP: " << num_threads << std::endl;
|
||||
|
||||
Kokkos::OpenMP::initialize( num_threads );
|
||||
|
||||
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
|
||||
Kokkos::OpenMP::initialize();
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
Kokkos::OpenMP::finalize();
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
ASSERT_EQ( 1 , omp_get_max_threads() );
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -564,7 +564,7 @@ namespace Impl {
|
||||
template< class D, class A1, class A2, class A3, class ... Args >
|
||||
struct DualViewSubview {
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::ViewMapping
|
||||
typedef typename Kokkos::Impl::ViewMapping
|
||||
< void
|
||||
, Kokkos::ViewTraits< D, A1, A2, A3 >
|
||||
, Args ...
|
||||
|
||||
@ -46,19 +46,6 @@
|
||||
///
|
||||
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
|
||||
/// related nonmember functions.
|
||||
/*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_DYNRANKVIEW_HPP
|
||||
#define KOKKOS_DYNRANKVIEW_HPP
|
||||
@ -117,6 +104,14 @@ struct DynRankDimTraits {
|
||||
, layout.dimension[7] );
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types v2
|
||||
template <typename Layout, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
|
||||
{
|
||||
return computeRank(layout);
|
||||
}
|
||||
|
||||
// Create the layout for the rank-7 view.
|
||||
// Non-strided Layout
|
||||
template <typename Layout>
|
||||
@ -158,8 +153,17 @@ struct DynRankDimTraits {
|
||||
);
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types
|
||||
template <typename Traits, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
|
||||
{
|
||||
return createLayout( layout );
|
||||
}
|
||||
|
||||
// Create a view from the given dimension arguments.
|
||||
// This is only necessary because the shmem constructor doesn't take a layout.
|
||||
// NDE: shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
|
||||
template <typename ViewType, typename ViewArg>
|
||||
static ViewType createView( const ViewArg& arg
|
||||
, const size_t N0
|
||||
@ -186,7 +190,8 @@ struct DynRankDimTraits {
|
||||
// Non-strided Layout
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
|
||||
@ -202,7 +207,8 @@ struct DynRankDimTraits {
|
||||
// LayoutStride
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 0 ? layout.stride[0] : (0)
|
||||
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
|
||||
/** \brief Assign compatible default mappings */
|
||||
struct ViewToDynRankViewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class DstTraits , class SrcTraits >
|
||||
class ViewMapping< DstTraits , SrcTraits ,
|
||||
typename std::enable_if<(
|
||||
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
|
||||
)
|
||||
)
|
||||
)
|
||||
) , ViewToDynRankViewTag >::type >
|
||||
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
|
||||
{
|
||||
private:
|
||||
|
||||
@ -376,7 +387,7 @@ public:
|
||||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_track.assign( src.m_track , DstTraits::is_managed );
|
||||
dst.m_rank = src.Rank ;
|
||||
}
|
||||
@ -384,22 +395,20 @@ public:
|
||||
|
||||
} //end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/* \class DynRankView
|
||||
* \brief Container that creates a Kokkos view with rank determined at runtime.
|
||||
* Essentially this is a rank 7 view that wraps the access operators
|
||||
* to yield the functionality of a view
|
||||
* Essentially this is a rank 7 view
|
||||
*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
|
||||
* 4. Every subview is returned with LayoutStride
|
||||
* 5. Copy and Copy-Assign View to DynRankView
|
||||
* 6. deep_copy between Views and DynRankViews
|
||||
* 7. rank( view ); returns the rank of View or DynRankView
|
||||
*
|
||||
*/
|
||||
|
||||
@ -427,7 +436,7 @@ public:
|
||||
|
||||
|
||||
private:
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
@ -556,7 +565,7 @@ public:
|
||||
// Allow specializations to query their specialized map
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
implementation_map() const { return m_map ; }
|
||||
|
||||
//----------------------------------------
|
||||
@ -803,7 +812,7 @@ public:
|
||||
, m_rank(rhs.m_rank)
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
}
|
||||
@ -813,7 +822,7 @@ public:
|
||||
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
m_track.assign( rhs.m_track , traits::is_managed );
|
||||
@ -831,7 +840,7 @@ public:
|
||||
, m_rank( rhs.Rank )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( *this , rhs );
|
||||
}
|
||||
@ -841,7 +850,7 @@ public:
|
||||
DynRankView & operator = ( const View<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
|
||||
Mapping::assign( *this , rhs );
|
||||
return *this ;
|
||||
@ -870,7 +879,7 @@ public:
|
||||
)
|
||||
: m_track()
|
||||
, m_map()
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
// Append layout and spaces if not input
|
||||
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
|
||||
@ -923,7 +932,7 @@ public:
|
||||
//------------------------------------------------------------
|
||||
|
||||
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
|
||||
|
||||
//------------------------------------------------------------
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
@ -947,8 +956,8 @@ public:
|
||||
>::type const & arg_layout
|
||||
)
|
||||
: m_track() // No memory tracking
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
static_assert(
|
||||
std::is_same< pointer_type
|
||||
@ -1034,6 +1043,7 @@ public:
|
||||
{}
|
||||
|
||||
// For backward compatibility
|
||||
// NDE: This ctor does not take a ViewCtorProp argument, so it should not use the alternative createLayout call
|
||||
explicit inline
|
||||
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
|
||||
, const typename traits::array_layout & arg_layout
|
||||
@ -1179,6 +1189,11 @@ namespace Impl {
|
||||
|
||||
struct DynRankSubviewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class SrcTraits , class ... Args >
|
||||
struct ViewMapping
|
||||
< typename std::enable_if<(
|
||||
@ -1192,7 +1207,7 @@ struct ViewMapping
|
||||
std::is_same< typename SrcTraits::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
)
|
||||
), DynRankSubviewTag >::type
|
||||
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
|
||||
, SrcTraits
|
||||
, Args ... >
|
||||
{
|
||||
@ -1264,7 +1279,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
|
||||
template < typename T , class ... P >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -1336,9 +1351,10 @@ public:
|
||||
|
||||
} // end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
template< class V , class ... Args >
|
||||
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
|
||||
template< class D , class ... P , class ...Args >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
|
||||
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
|
||||
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
|
||||
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
|
||||
return metafcn::subview( src.rank() , src , args... );
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -57,7 +57,7 @@ namespace Experimental {
|
||||
*/
|
||||
template< typename DataType , typename ... P >
|
||||
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
|
||||
{
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
|
||||
@ -68,7 +68,7 @@ private:
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
, "DynamicView must be rank-one" );
|
||||
|
||||
static_assert( std::is_trivial< typename traits::value_type >::value &&
|
||||
@ -216,14 +216,14 @@ public:
|
||||
// Verify that allocation of the requested chunk is in progress.
|
||||
|
||||
// The allocated chunk counter is m_chunks[ m_chunk_max ]
|
||||
const uintptr_t n =
|
||||
const uintptr_t n =
|
||||
*reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
|
||||
|
||||
if ( n <= ic ) {
|
||||
Kokkos::abort("Kokkos::DynamicView array bounds error");
|
||||
}
|
||||
|
||||
// Allocation of this chunk is in progress
|
||||
// Allocation of this chunk is in progress
|
||||
// so wait for allocation to complete.
|
||||
while ( 0 == *ch );
|
||||
}
|
||||
@ -267,7 +267,7 @@ public:
|
||||
const uintptr_t jc_try = jc ;
|
||||
|
||||
// Jump iteration to the chunk counter.
|
||||
|
||||
|
||||
jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
|
||||
|
||||
if ( jc_try == jc ) {
|
||||
@ -316,7 +316,7 @@ public:
|
||||
}
|
||||
else {
|
||||
while ( NC + 1 <= *pc ) {
|
||||
--*pc ;
|
||||
--*pc ;
|
||||
m_pool.deallocate( m_chunks[*pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*pc] = 0 ;
|
||||
@ -331,7 +331,7 @@ public:
|
||||
typename traits::value_type ** m_chunks ;
|
||||
uintptr_t * m_pc ;
|
||||
uintptr_t m_nc ;
|
||||
unsigned m_chunk_shift ;
|
||||
unsigned m_chunk_shift ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( int ) const
|
||||
@ -348,7 +348,7 @@ public:
|
||||
}
|
||||
else {
|
||||
while ( m_nc + 1 <= *m_pc ) {
|
||||
--*m_pc ;
|
||||
--*m_pc ;
|
||||
m_pool.deallocate( m_chunks[*m_pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*m_pc] = 0 ;
|
||||
@ -482,7 +482,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/**\brief Allocation constructor
|
||||
/**\brief Allocation constructor
|
||||
*
|
||||
* Memory is allocated in chunks from the memory pool.
|
||||
* The chunk size conforms to the memory pool's chunk size.
|
||||
@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
|
||||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
|
||||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
|
||||
@ -69,6 +69,8 @@
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
|
||||
TestDynViewAPI< double , Kokkos::Cuda >();
|
||||
}
|
||||
|
||||
TEST_F( cuda, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( cuda , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
|
||||
|
||||
@ -66,6 +66,8 @@
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace Test {
|
||||
@ -76,14 +78,7 @@ protected:
|
||||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = 4 ;
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::initialize();
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
|
||||
TestDynViewAPI< double , Kokkos::OpenMP >();
|
||||
}
|
||||
|
||||
TEST_F( openmp, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( openmp, bitset )
|
||||
{
|
||||
test_bitset<Kokkos::OpenMP>();
|
||||
|
||||
@ -67,6 +67,8 @@
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class serial : public ::testing::Test {
|
||||
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
|
||||
TestDynViewAPI< double , Kokkos::Serial >();
|
||||
}
|
||||
|
||||
TEST_F( serial, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( serial , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
|
||||
|
||||
@ -70,6 +70,8 @@
|
||||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class threads : public ::testing::Test {
|
||||
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
|
||||
TestDynViewAPI< double , Kokkos::Threads >();
|
||||
}
|
||||
|
||||
TEST_F( threads, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( threads , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
|
||||
|
||||
213
lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
Normal file
213
lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
Normal file
@ -0,0 +1,213 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
#include <typeinfo>
|
||||
|
||||
namespace Test {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename ExecSpace >
|
||||
struct TestViewCtorProp_EmbeddedDim {
|
||||
|
||||
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
|
||||
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
|
||||
|
||||
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
|
||||
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
|
||||
|
||||
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
|
||||
template < class ViewType >
|
||||
struct Functor {
|
||||
|
||||
ViewType v;
|
||||
|
||||
Functor( const ViewType & v_ ) : v(v_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int i ) const {
|
||||
v(i) = i;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
static void test_vcpt( const int N0, const int N1 )
|
||||
{
|
||||
|
||||
// Create two views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
#if 0
|
||||
// debug output
|
||||
for ( int i = 0; i < N0*N1; ++i ) {
|
||||
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
|
||||
}
|
||||
|
||||
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
|
||||
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
|
||||
if ( std::is_same< CommonViewValueType, double >::value == true ) {
|
||||
printf("Proper common value_type\n");
|
||||
}
|
||||
else {
|
||||
printf("WRONG common value_type\n");
|
||||
}
|
||||
// end debug output
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Create two dynamic rank views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
}
|
||||
|
||||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // end test_vcpt
|
||||
|
||||
}; // end struct
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace Test
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,12 +36,14 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
||||
@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
|
||||
test-taskdag: KokkosCore_PerformanceTest_TaskDAG
|
||||
./KokkosCore_PerformanceTest_TaskDAG
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,12 +36,14 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
2715
lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
Normal file
2715
lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -53,6 +53,7 @@
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
struct CudaLockArraysStruct {
|
||||
int* atomic;
|
||||
int* scratch;
|
||||
int* threadid;
|
||||
int n;
|
||||
};
|
||||
}
|
||||
}
|
||||
__device__ __constant__
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
|
||||
}
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
inline
|
||||
__device__
|
||||
@ -192,7 +152,7 @@ namespace Impl {
|
||||
// For 2.0 capability: 48 KB L1 and 16 KB shared
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class DriverType >
|
||||
template< class DriverType>
|
||||
__global__
|
||||
static void cuda_parallel_launch_constant_memory()
|
||||
{
|
||||
@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
|
||||
driver();
|
||||
}
|
||||
|
||||
template< class DriverType >
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_constant_memory()
|
||||
{
|
||||
const DriverType & driver =
|
||||
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
|
||||
|
||||
driver();
|
||||
}
|
||||
|
||||
template< class DriverType>
|
||||
__global__
|
||||
static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
{
|
||||
driver();
|
||||
}
|
||||
|
||||
template < class DriverType ,
|
||||
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
{
|
||||
driver();
|
||||
}
|
||||
|
||||
template < class DriverType
|
||||
, class LaunchBounds = Kokkos::LaunchBounds<>
|
||||
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
|
||||
struct CudaParallelLaunch ;
|
||||
|
||||
template < class DriverType >
|
||||
struct CudaParallelLaunch< DriverType , true > {
|
||||
template < class DriverType, class LaunchBounds >
|
||||
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
|
||||
|
||||
inline
|
||||
CudaParallelLaunch( const DriverType & driver
|
||||
@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
|
||||
}
|
||||
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else if ( shmem ) {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Copy functor to constant memory on the device
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
|
||||
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Invoke the driver function on the device
|
||||
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
|
||||
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
|
||||
}
|
||||
};
|
||||
|
||||
template < class DriverType >
|
||||
struct CudaParallelLaunch< DriverType , false > {
|
||||
template < class DriverType, class LaunchBounds >
|
||||
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
|
||||
|
||||
inline
|
||||
CudaParallelLaunch( const DriverType & driver
|
||||
@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
|
||||
}
|
||||
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else if ( shmem ) {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
|
||||
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
constexpr const char* CudaSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
constexpr const char* CudaUVMSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
constexpr const char* CudaHostPinnedSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void > *
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
|
||||
{
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordBase = SharedAllocationRecord< void , void > ;
|
||||
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
|
||||
|
||||
#if 0
|
||||
using Header = SharedAllocationHeader ;
|
||||
|
||||
// Copy the header from the allocation
|
||||
Header head ;
|
||||
|
||||
@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
|
||||
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
__global__ void init_lock_array_kernel_atomic() {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<CUDA_SPACE_ATOMIC_MASK+1)
|
||||
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
|
||||
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<N) {
|
||||
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
|
||||
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace Impl {
|
||||
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void init_lock_arrays_cuda_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized) {
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
}
|
||||
}
|
||||
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
||||
static void* ptr = NULL;
|
||||
static std::int64_t current_size = 0;
|
||||
@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
@ -51,6 +51,7 @@
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
@ -69,9 +70,6 @@
|
||||
__device__ __constant__
|
||||
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
|
||||
|
||||
__device__ __constant__
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
@ -103,6 +101,7 @@ int cuda_kernel_arch()
|
||||
return arch ;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_UVM
|
||||
bool cuda_launch_blocking()
|
||||
{
|
||||
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
|
||||
@ -111,16 +110,13 @@ bool cuda_launch_blocking()
|
||||
|
||||
return atoi(env);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void cuda_device_synchronize()
|
||||
{
|
||||
// static const bool launch_blocking = cuda_launch_blocking();
|
||||
|
||||
// if (!launch_blocking) {
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
// }
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
|
||||
@ -240,6 +236,7 @@ public:
|
||||
unsigned m_maxWarpCount ;
|
||||
unsigned m_maxBlock ;
|
||||
unsigned m_maxSharedWords ;
|
||||
uint32_t m_maxConcurrency ;
|
||||
size_type m_scratchSpaceCount ;
|
||||
size_type m_scratchFlagsCount ;
|
||||
size_type m_scratchUnifiedCount ;
|
||||
@ -248,6 +245,7 @@ public:
|
||||
size_type * m_scratchSpace ;
|
||||
size_type * m_scratchFlags ;
|
||||
size_type * m_scratchUnified ;
|
||||
uint32_t * m_scratchConcurrentBitset ;
|
||||
cudaStream_t * m_stream ;
|
||||
|
||||
static int was_initialized;
|
||||
@ -274,6 +272,7 @@ public:
|
||||
, m_maxWarpCount( 0 )
|
||||
, m_maxBlock( 0 )
|
||||
, m_maxSharedWords( 0 )
|
||||
, m_maxConcurrency( 0 )
|
||||
, m_scratchSpaceCount( 0 )
|
||||
, m_scratchFlagsCount( 0 )
|
||||
, m_scratchUnifiedCount( 0 )
|
||||
@ -282,6 +281,7 @@ public:
|
||||
, m_scratchSpace( 0 )
|
||||
, m_scratchFlags( 0 )
|
||||
, m_scratchUnified( 0 )
|
||||
, m_scratchConcurrentBitset( 0 )
|
||||
, m_stream( 0 )
|
||||
{}
|
||||
|
||||
@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
|
||||
if ( m_stream ||
|
||||
m_scratchSpace ||
|
||||
m_scratchFlags ||
|
||||
m_scratchUnified ) {
|
||||
m_scratchUnified ||
|
||||
m_scratchConcurrentBitset ) {
|
||||
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
|
||||
<< std::endl ;
|
||||
std::cerr.flush();
|
||||
@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
|
||||
m_maxWarpCount = 0 ;
|
||||
m_maxBlock = 0 ;
|
||||
m_maxSharedWords = 0 ;
|
||||
m_maxConcurrency = 0 ;
|
||||
m_scratchSpaceCount = 0 ;
|
||||
m_scratchFlagsCount = 0 ;
|
||||
m_scratchUnifiedCount = 0 ;
|
||||
@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
|
||||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
m_scratchUnified = 0 ;
|
||||
m_scratchConcurrentBitset = 0 ;
|
||||
m_stream = 0 ;
|
||||
}
|
||||
|
||||
@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
|
||||
}
|
||||
//----------------------------------
|
||||
// Concurrent bitset for obtaining unique tokens from within
|
||||
// an executing kernel.
|
||||
{
|
||||
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
|
||||
|
||||
m_maxConcurrency =
|
||||
max_threads_per_sm * cudaProp.multiProcessorCount ;
|
||||
|
||||
const int32_t buffer_bound =
|
||||
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
|
||||
|
||||
// Allocate and initialize uint32_t[ buffer_bound ]
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchBitset"
|
||||
, sizeof(uint32_t) * buffer_bound );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
|
||||
|
||||
}
|
||||
//----------------------------------
|
||||
|
||||
if ( stream_count ) {
|
||||
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
|
||||
@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
|
||||
// Init the array used for arbitrarily sized atomics
|
||||
Impl::init_lock_arrays_cuda_space();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
Impl::initialize_host_cuda_lock_arrays();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -635,9 +656,7 @@ void CudaInternal::finalize()
|
||||
was_finalized = 1;
|
||||
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
|
||||
|
||||
atomic_lock_array_cuda_space_ptr(true);
|
||||
scratch_lock_array_cuda_space_ptr(true);
|
||||
threadid_lock_array_cuda_space_ptr(true);
|
||||
Impl::finalize_host_cuda_lock_arrays();
|
||||
|
||||
if ( m_stream ) {
|
||||
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
|
||||
@ -653,6 +672,7 @@ void CudaInternal::finalize()
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
|
||||
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
|
||||
|
||||
m_cudaDev = -1 ;
|
||||
m_multiProcCount = 0 ;
|
||||
@ -666,6 +686,7 @@ void CudaInternal::finalize()
|
||||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
m_scratchUnified = 0 ;
|
||||
m_scratchConcurrentBitset = 0 ;
|
||||
m_stream = 0 ;
|
||||
}
|
||||
}
|
||||
@ -713,9 +734,8 @@ namespace Kokkos {
|
||||
Cuda::size_type Cuda::detect_device_count()
|
||||
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
|
||||
|
||||
int Cuda::concurrency() {
|
||||
return 131072;
|
||||
}
|
||||
int Cuda::concurrency()
|
||||
{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
|
||||
|
||||
int Cuda::is_initialized()
|
||||
{ return Impl::CudaInternal::singleton().is_initialized(); }
|
||||
@ -798,7 +818,22 @@ void Cuda::fence()
|
||||
const char* Cuda::name() { return "Cuda"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
|
||||
UniqueToken( Kokkos::Cuda const & )
|
||||
: m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
|
||||
, m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
|
||||
{}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
|
||||
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
|
||||
119
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
Normal file
119
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
Normal file
@ -0,0 +1,119 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
__device__ __constant__
|
||||
CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace {
|
||||
|
||||
__global__ void init_lock_array_kernel_atomic() {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if(i<CUDA_SPACE_ATOMIC_MASK+1) {
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void init_lock_array_kernel_threadid(int N) {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if(i<(unsigned)N) {
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace Impl {
|
||||
|
||||
CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
|
||||
|
||||
void initialize_host_cuda_lock_arrays() {
|
||||
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
|
||||
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
|
||||
sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
|
||||
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
|
||||
sizeof(int)*(Cuda::concurrency())));
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
g_host_cuda_lock_arrays.n = Cuda::concurrency();
|
||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
|
||||
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void finalize_host_cuda_lock_arrays() {
|
||||
if (g_host_cuda_lock_arrays.atomic == nullptr) return;
|
||||
cudaFree(g_host_cuda_lock_arrays.atomic);
|
||||
g_host_cuda_lock_arrays.atomic = nullptr;
|
||||
cudaFree(g_host_cuda_lock_arrays.scratch);
|
||||
g_host_cuda_lock_arrays.scratch = nullptr;
|
||||
g_host_cuda_lock_arrays.n = 0;
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
|
||||
void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
|
||||
|
||||
#endif
|
||||
166
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
Normal file
166
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
Normal file
@ -0,0 +1,166 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_LOCKS_HPP
|
||||
#define KOKKOS_CUDA_LOCKS_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
struct CudaLockArrays {
|
||||
std::int32_t* atomic;
|
||||
std::int32_t* scratch;
|
||||
std::int32_t n;
|
||||
};
|
||||
|
||||
/// \brief This global variable in Host space is the central definition
|
||||
/// of these arrays.
|
||||
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
|
||||
|
||||
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
||||
/// valid, initialized arrays.
|
||||
///
|
||||
/// This call is idempotent.
|
||||
void initialize_host_cuda_lock_arrays();
|
||||
|
||||
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
||||
/// all null pointers, and all array memory has been freed.
|
||||
///
|
||||
/// This call is idempotent.
|
||||
void finalize_host_cuda_lock_arrays();
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#if defined( __CUDACC__ )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief This global variable in CUDA space is what kernels use
|
||||
/// to get access to the lock arrays.
|
||||
///
|
||||
/// When relocatable device code is enabled, there can be one single
|
||||
/// instance of this global variable for the entire executable,
|
||||
/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
|
||||
/// here must then be extern).
|
||||
/// This one instance will be initialized by initialize_host_cuda_lock_arrays
|
||||
/// and need not be modified afterwards.
|
||||
///
|
||||
/// When relocatable device code is disabled, an instance of this variable
|
||||
/// will be created in every translation unit that sees this header file
|
||||
/// (we make this clear by marking it static, meaning no other translation
|
||||
/// unit can link to it).
|
||||
/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
|
||||
/// instances in other translation units, we must update this CUDA global
|
||||
/// variable based on the Host global variable prior to running any kernels
|
||||
/// that will use it.
|
||||
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
|
||||
__device__ __constant__
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
|
||||
|
||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
|
||||
/// \brief Acquire a lock for the address
|
||||
///
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully acquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
|
||||
}
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully acquiring a lock with
|
||||
/// lock_address_cuda_space.
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/* Dan Ibanez: it is critical that this code be a macro, so that it will
|
||||
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
|
||||
putting this in an inline function will NOT do the right thing! */
|
||||
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
|
||||
{ \
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays , \
|
||||
& Kokkos::Impl::g_host_cuda_lock_arrays , \
|
||||
sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
||||
#else
|
||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
|
||||
#endif
|
||||
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
||||
|
||||
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
|
||||
@ -58,6 +58,7 @@
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
@ -65,6 +66,8 @@
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -318,6 +321,7 @@ private:
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -363,7 +367,7 @@ public:
|
||||
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
|
||||
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
|
||||
|
||||
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
@ -373,6 +377,115 @@ public:
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, Kokkos::Cuda
|
||||
>
|
||||
{
|
||||
private:
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
|
||||
using RP = Policy;
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_rp ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
__device__
|
||||
void operator()(void) const
|
||||
{
|
||||
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
|
||||
if ( RP::rank == 2 )
|
||||
{
|
||||
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
|
||||
const dim3 grid(
|
||||
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
|
||||
, 1
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 3 )
|
||||
{
|
||||
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
|
||||
const dim3 grid(
|
||||
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 4 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 5 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 6 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
|
||||
Kokkos::abort("Aborting");
|
||||
}
|
||||
|
||||
} //end execute
|
||||
|
||||
// inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_rp( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
@ -384,6 +497,7 @@ private:
|
||||
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
public:
|
||||
|
||||
@ -430,15 +544,15 @@ public:
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
@ -448,7 +562,8 @@ public:
|
||||
}
|
||||
|
||||
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
|
||||
this-> template exec_team< WorkTag >(
|
||||
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
|
||||
@ -462,7 +577,7 @@ public:
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__syncthreads();
|
||||
if (threadIdx.x==0 && threadIdx.y==0 )
|
||||
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -473,7 +588,7 @@ public:
|
||||
const dim3 grid( int(m_league_size) , 1 , 1 );
|
||||
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
|
||||
|
||||
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
}
|
||||
|
||||
@ -529,6 +644,7 @@ private:
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
@ -563,6 +679,7 @@ private:
|
||||
typedef int DummySHMEMReductionType;
|
||||
|
||||
public:
|
||||
// Make the exec_range calls call to Reduce::DeviceIterateTile
|
||||
template< class TagType >
|
||||
__device__ inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
@ -686,7 +803,7 @@ public:
|
||||
|
||||
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
@ -737,6 +854,232 @@ public:
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Cuda
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
public:
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef FunctorType functor_type ;
|
||||
typedef Cuda::size_type size_type ;
|
||||
|
||||
// Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ; // used for workrange and nwork
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
size_type * m_scratch_space ;
|
||||
size_type * m_scratch_flags ;
|
||||
size_type * m_unified_space ;
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
|
||||
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128 bits)
|
||||
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
|
||||
public:
|
||||
inline
|
||||
__device__
|
||||
void
|
||||
exec_range( reference_type update ) const
|
||||
{
|
||||
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
|
||||
}
|
||||
|
||||
inline
|
||||
__device__
|
||||
void operator() (void) const {
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummySHMEMReductionType& ) const
|
||||
{
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
|
||||
|
||||
{
|
||||
reference_type value =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
|
||||
|
||||
// Number of blocks is bounded so that the reduction can be limited to two passes.
|
||||
// Each thread block is given an approximately equal amount of work to perform.
|
||||
// Accumulate the values for this block.
|
||||
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
|
||||
|
||||
this-> exec_range( value );
|
||||
}
|
||||
|
||||
// Reduce with final value at blockDim.y - 1 location.
|
||||
// Problem: non power-of-two blockDim
|
||||
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
|
||||
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
|
||||
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
|
||||
|
||||
// This is the final block with the final result at the final threads' location
|
||||
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
|
||||
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
|
||||
|
||||
if ( threadIdx.y == 0 ) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
|
||||
}
|
||||
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
|
||||
|
||||
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummyShflReductionType&) const
|
||||
{
|
||||
|
||||
value_type value;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
// Number of blocks is bounded so that the reduction can be limited to two passes.
|
||||
// Each thread block is given an approximately equal amount of work to perform.
|
||||
// Accumulate the values for this block.
|
||||
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
|
||||
|
||||
const Member work_part =
|
||||
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
|
||||
|
||||
this-> exec_range( value );
|
||||
|
||||
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
|
||||
|
||||
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
|
||||
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
|
||||
|
||||
value_type init;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
|
||||
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
|
||||
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
|
||||
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
if(id==0) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
|
||||
*result = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine block size constrained by shared memory:
|
||||
static inline
|
||||
unsigned local_block_size( const FunctorType & f )
|
||||
{
|
||||
unsigned n = CudaTraits::WarpSize * 8 ;
|
||||
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
|
||||
return n ;
|
||||
}
|
||||
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
const int nwork = m_policy.m_num_tiles;
|
||||
if ( nwork ) {
|
||||
int block_size = m_policy.m_prod_tile_dims;
|
||||
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
|
||||
// Nearest power of two
|
||||
int exponent_pow_two = std::ceil( std::log2(block_size) );
|
||||
block_size = std::pow(2, exponent_pow_two);
|
||||
int suggested_blocksize = local_block_size( m_functor );
|
||||
|
||||
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
|
||||
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
|
||||
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
|
||||
|
||||
// REQUIRED ( 1 , N , 1 )
|
||||
const dim3 block( 1 , block_size , 1 );
|
||||
// Required grid.x <= block.y
|
||||
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
|
||||
|
||||
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
|
||||
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (m_result_ptr) {
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
{}
|
||||
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ReducerType & reducer)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().ptr_on_device() )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if 1
|
||||
@ -753,6 +1096,7 @@ private:
|
||||
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
@ -819,15 +1163,15 @@ public:
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
@ -840,7 +1184,7 @@ public:
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__syncthreads();
|
||||
if (threadIdx.x==0 && threadIdx.y==0 )
|
||||
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -854,7 +1198,8 @@ public:
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
|
||||
|
||||
// Iterate this block through the league
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
this-> template exec_team< WorkTag >
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
@ -894,7 +1239,8 @@ public:
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
|
||||
// Iterate this block through the league
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
this-> template exec_team< WorkTag >
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
@ -936,7 +1282,7 @@ public:
|
||||
const dim3 grid( block_count , 1 , 1 );
|
||||
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
@ -975,12 +1321,6 @@ public:
|
||||
, m_shmem_begin( 0 )
|
||||
, m_shmem_size( 0 )
|
||||
, m_scratch_ptr{NULL,NULL}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
, m_scratch_size{
|
||||
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
@ -991,6 +1331,12 @@ public:
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
)}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
{
|
||||
// Return Init value if the number of worksets is zero
|
||||
if( arg_policy.league_size() == 0) {
|
||||
@ -1150,6 +1496,7 @@ private:
|
||||
typedef typename reducer_type<>::pointer_type pointer_type ;
|
||||
typedef typename reducer_type<>::reference_type reference_type ;
|
||||
typedef typename reducer_type<>::value_type value_type ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorAnalysis
|
||||
< Kokkos::Impl::FunctorPatternInterface::REDUCE
|
||||
@ -1273,7 +1620,7 @@ public:
|
||||
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
|
||||
|
||||
// copy to device and execute
|
||||
CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem );
|
||||
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
@ -1373,7 +1720,7 @@ public:
|
||||
|
||||
if ( CudaTraits::WarpSize < team_threads ) {
|
||||
// Need inter-warp team reduction (collectives) shared memory
|
||||
// Speculate an upper bound for the value size
|
||||
// Speculate an upper bound for the value size
|
||||
|
||||
m_shmem_team_begin =
|
||||
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
|
||||
@ -1426,7 +1773,7 @@ public:
|
||||
|
||||
// Reduce space has claim flag followed by value buffer
|
||||
const int global_reduce_value_size =
|
||||
max_concurrent_block *
|
||||
max_concurrent_block *
|
||||
( aligned_flag_size + align_scratch( value_size ) );
|
||||
|
||||
// Scratch space has claim flag followed by scratch buffer
|
||||
@ -1469,6 +1816,7 @@ private:
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
@ -1655,10 +2003,10 @@ public:
|
||||
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
|
||||
|
||||
m_final = false ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
m_final = true ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
|
||||
__device__
|
||||
inline void cuda_intra_warp_reduction( ValueType& result,
|
||||
const JoinOp& join,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
const uint32_t max_active_thread = blockDim.y) {
|
||||
|
||||
unsigned int shift = 1;
|
||||
|
||||
@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
int active = __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
}
|
||||
}
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
@ -302,7 +306,7 @@ template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
const uint32_t max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
int active = __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
//typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// '__ffs' = position of the least significant bit set to 1.
|
||||
// 'blockDim.y' is guaranteed to be a power of two so this
|
||||
@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
|
||||
{
|
||||
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
|
||||
reference_type shared_value = ValueInit::init( functor , shared_ptr );
|
||||
/* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
|
||||
|
||||
for ( size_type i = b ; i < e ; ++i ) {
|
||||
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
|
||||
|
||||
@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
|
||||
// Debug-only helper (compiled under KOKKOS_DEBUG): reports when the calling
// warp is not fully converged at the call site.
//
// `where` is a caller-supplied label that is echoed in the diagnostic.
// The result of `__ballot(1)` is compared against ~0u (all 32 bits set);
// on mismatch a message with the label, blockIdx, threadIdx and the ballot
// value in hex is printed. Nothing is returned and execution continues.
// NOTE(review): this relies on `__ballot(1)` yielding one bit per lane that
// executed the call — confirm against the CUDA warp vote documentation for
// the targeted architectures.
__device__
|
||||
void verify_warp_convergence( const char * const where )
|
||||
{
|
||||
const unsigned b = __ballot(1);
|
||||
|
||||
if ( b != ~0u ) {
|
||||
|
||||
printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
|
||||
, where
|
||||
, blockIdx.x
|
||||
, blockIdx.y
|
||||
, blockIdx.z
|
||||
, threadIdx.x
|
||||
, threadIdx.y
|
||||
, threadIdx.z
|
||||
, b );
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // #if defined( KOKKOS_DEBUG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
__device__
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
||||
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
|
||||
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
|
||||
, int32_t shmem_per_warp )
|
||||
{
|
||||
using Member = TaskExec< Kokkos::Cuda > ;
|
||||
using Queue = TaskQueue< Kokkos::Cuda > ;
|
||||
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
|
||||
using task_root_type = TaskBase< void , void , void > ;
|
||||
|
||||
extern __shared__ int32_t shmem_all[];
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec( 1 );
|
||||
Member team_exec( blockDim.y );
|
||||
int32_t * const warp_shmem =
|
||||
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
|
||||
|
||||
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
|
||||
|
||||
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
|
||||
|
||||
union {
|
||||
task_root_type * ptr ;
|
||||
int raw[2] ;
|
||||
} task ;
|
||||
Member single_exec( warp_shmem , 1 );
|
||||
Member team_exec( warp_shmem , blockDim.y );
|
||||
|
||||
task_root_type * task_ptr ;
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
|
||||
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
|
||||
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
|
||||
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
, uintptr_t(task.ptr));
|
||||
, uintptr_t(task_ptr));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// shuffle broadcast
|
||||
|
||||
task.raw[0] = __shfl( task.raw[0] , 0 );
|
||||
task.raw[1] = __shfl( task.raw[1] , 0 );
|
||||
((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
|
||||
((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
|
||||
|
||||
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
verify_warp_convergence("task_ptr");
|
||||
#endif
|
||||
|
||||
if ( end != task.ptr ) {
|
||||
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
|
||||
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
|
||||
|
||||
if ( end != task_ptr ) {
|
||||
|
||||
// Whole warp copy task's closure to/from shared memory.
|
||||
// Use all threads of warp for coalesced read/write.
|
||||
|
||||
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
|
||||
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
|
||||
|
||||
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
|
||||
|
||||
// copy global to shared memory:
|
||||
|
||||
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
|
||||
warp_shmem[i] = task_mem[i] ;
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// Copy done - use memory fence so that memory writes are visible.
|
||||
// For reliable warp convergence on Pascal and Volta an explicit
|
||||
// warp level synchronization will also be required.
|
||||
|
||||
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task.ptr->m_apply)( task.ptr , & team_exec );
|
||||
(*task_shmem->m_apply)( task_shmem , & team_exec );
|
||||
}
|
||||
else if ( 0 == threadIdx.y ) {
|
||||
// Single Thread Task
|
||||
(*task.ptr->m_apply)( task.ptr , & single_exec );
|
||||
(*task_shmem->m_apply)( task_shmem , & single_exec );
|
||||
}
|
||||
|
||||
// copy shared to global memory:
|
||||
|
||||
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
|
||||
task_mem[i] = warp_shmem[i] ;
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
verify_warp_convergence("apply");
|
||||
#endif
|
||||
|
||||
// If respawn requested copy respawn data back to main memory
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
|
||||
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
|
||||
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
|
||||
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
|
||||
}
|
||||
|
||||
queue->complete( task_ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
namespace {
|
||||
|
||||
__global__
|
||||
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
|
||||
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
|
||||
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
|
||||
, int32_t shmem_size )
|
||||
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::execute
|
||||
( TaskQueue< Kokkos::Cuda > * const queue )
|
||||
{
|
||||
const int shared_per_warp = 2048 ;
|
||||
const int warps_per_block = 4 ;
|
||||
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
|
||||
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
|
||||
const int shared = 0 ;
|
||||
const int shared_total = shared_per_warp * warps_per_block ;
|
||||
const cudaStream_t stream = 0 ;
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
|
||||
//
|
||||
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
|
||||
|
||||
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
|
||||
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ namespace {
|
||||
template< typename TaskType >
|
||||
__global__
|
||||
void set_cuda_task_base_apply_function_pointer
|
||||
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
|
||||
( TaskBase<void,void,void>::function_type * ptr )
|
||||
{ *ptr = TaskType::apply ; }
|
||||
|
||||
}
|
||||
@ -78,7 +78,7 @@ public:
|
||||
void iff_single_thread_recursive_execute( queue_type * const ) {}
|
||||
|
||||
__device__
|
||||
static void driver( queue_type * const );
|
||||
static void driver( queue_type * const , int32_t );
|
||||
|
||||
static
|
||||
void execute( queue_type * const );
|
||||
@ -106,7 +106,14 @@ public:
|
||||
|
||||
extern template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
|
||||
* passed to tasks running in a Cuda space.
|
||||
*
|
||||
@ -134,11 +141,13 @@ private:
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
|
||||
|
||||
int32_t * m_team_shmem ;
|
||||
const int m_team_size ;
|
||||
|
||||
__device__
|
||||
TaskExec( int arg_team_size = blockDim.y )
|
||||
: m_team_size( arg_team_size ) {}
|
||||
TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
|
||||
: m_team_shmem( arg_team_shmem )
|
||||
, m_team_size( arg_team_size ) {}
|
||||
|
||||
public:
|
||||
|
||||
@ -154,7 +163,13 @@ public:
|
||||
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
|
||||
|
||||
@ -106,7 +106,7 @@ private:
|
||||
typedef Kokkos::Cuda execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
void * m_team_reduce ;
|
||||
mutable void * m_team_reduce ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_reduce_size ;
|
||||
int m_league_rank ;
|
||||
@ -166,7 +166,7 @@ public:
|
||||
if ( 1 == blockDim.z ) { // team == block
|
||||
__syncthreads();
|
||||
// Wait for shared data write until all threads arrive here
|
||||
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
|
||||
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
|
||||
*((ValueType*) m_team_reduce) = val ;
|
||||
}
|
||||
__syncthreads(); // Wait for shared data read until root thread writes
|
||||
@ -210,7 +210,7 @@ public:
|
||||
const int wx =
|
||||
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
|
||||
|
||||
@ -354,7 +354,7 @@ public:
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
|
||||
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
}
|
||||
|
||||
// Broadcast from root lane to all other lanes.
|
||||
@ -410,7 +410,7 @@ public:
|
||||
|
||||
value_type tmp( reducer.reference() );
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
|
||||
|
||||
@ -479,7 +479,7 @@ public:
|
||||
|
||||
__threadfence(); // Wait until global write is visible.
|
||||
|
||||
last_block = gridDim.x ==
|
||||
last_block = (int)gridDim.x ==
|
||||
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
|
||||
|
||||
// If last block then reset count
|
||||
@ -509,7 +509,7 @@ public:
|
||||
reducer.copy( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space) + offset );
|
||||
|
||||
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
|
||||
for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space)
|
||||
+ i * reducer.length() );
|
||||
@ -576,6 +576,14 @@ public:
|
||||
, m_league_size( arg_league_size )
|
||||
{}
|
||||
|
||||
public:
|
||||
// Declare to avoid unused private member warnings which are triggered
|
||||
// when SFINAE excludes the member function which uses these variables.
|
||||
// Making another class a friend also suppresses these warnings.
// The returned value only reads m_team_reduce_size and m_team_reduce;
// it is true when the size is positive and the pointer is non-null.
|
||||
bool impl_avoid_sfinae_warning() const noexcept
|
||||
{
|
||||
return m_team_reduce_size > 0 && m_team_reduce != nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -913,10 +921,10 @@ void parallel_scan
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
|
||||
for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
|
||||
value_type tmp = 0 ;
|
||||
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
|
||||
if ( j <= threadIdx.x ) { sval += tmp ; }
|
||||
if ( j <= (int)threadIdx.x ) { sval += tmp ; }
|
||||
}
|
||||
|
||||
// Include accumulation and remove value for exclusive scan:
|
||||
|
||||
133
lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
Normal file
133
lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
Normal file
@ -0,0 +1,133 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
#include <impl/Kokkos_SharedAlloc.hpp>
|
||||
#include <impl/Kokkos_ConcurrentBitset.hpp>
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
// both global and instance Unique Tokens are implemented in the same way
|
||||
/// \brief UniqueToken specialization for the Cuda execution space
///        (Global scope).
///
/// Hands out integer tokens in the range [0, size()) via acquire() and takes
/// them back via release(). Both operations delegate to
/// Kokkos::Impl::concurrent_bitset on the buffer held in m_buffer.
template<>
|
||||
class UniqueToken< Cuda, UniqueTokenScope::Global >
|
||||
{
|
||||
private:
|
||||
|
||||
// Buffer handed to concurrent_bitset::acquire_bounded / release.
uint32_t volatile * m_buffer ;
|
||||
// Upper bound passed to acquire_bounded and reported by size().
uint32_t m_count ;
|
||||
|
||||
public:
|
||||
|
||||
using execution_space = Cuda;
|
||||
|
||||
// Only declared here; the definition is not part of this header.
explicit
|
||||
UniqueToken( execution_space const& );
|
||||
|
||||
// Default construction leaves the token empty: null buffer, zero count.
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken() : m_buffer(0), m_count(0) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken( const UniqueToken & ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken( UniqueToken && ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken & operator=( const UniqueToken & ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken & operator=( UniqueToken && ) = default ;
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int32_t size() const noexcept { return m_count ; }
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
///
/// The search hint given to acquire_bounded is clock_tic() % m_count.
/// A negative result.first is treated as failure and aborts.
/// NOTE(review): with a default-constructed token m_count is 0, so the
/// modulo below would divide by zero — confirm callers only acquire from
/// tokens built with the execution_space constructor.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int32_t acquire() const
|
||||
{
|
||||
const Kokkos::pair<int,int> result =
|
||||
Kokkos::Impl::concurrent_bitset::
|
||||
acquire_bounded( m_buffer
|
||||
, m_count
|
||||
, Kokkos::Impl::clock_tic() % m_count
|
||||
);
|
||||
|
||||
if ( result.first < 0 ) {
|
||||
Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" );
|
||||
}
|
||||
|
||||
return result.first;
|
||||
}
|
||||
|
||||
/// \brief release an acquired value
///
/// Forwards \c i unchanged to concurrent_bitset::release on m_buffer.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void release( int32_t i ) const noexcept
|
||||
{
|
||||
Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief UniqueToken specialization for the Cuda execution space
///        (Instance scope).
///
/// Adds no state or behavior of its own: it publicly derives from the
/// Global-scope specialization and its only constructor forwards the
/// execution space argument to that base class.
template<>
|
||||
class UniqueToken< Cuda, UniqueTokenScope::Instance >
|
||||
: public UniqueToken< Cuda, UniqueTokenScope::Global >
|
||||
{
|
||||
public:
|
||||
|
||||
explicit
|
||||
UniqueToken( execution_space const& arg )
|
||||
: UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user