Compare commits


36 Commits

Author SHA1 Message Date
7ddcb6812b patch 17Aug17 2017-08-17 16:02:20 -06:00
76cd61350d Merge pull request #613 from akohlmey/collected-small-changes
Collected small changes and bug fixes
2017-08-17 11:34:52 -06:00
fa3c0c61d6 Merge pull request #618 from lammps/intel
USER-INTEL add-ons from Mike
2017-08-17 11:32:53 -06:00
c46d5ff422 Merge pull request #612 from giacomofiorin/colvars-update-2017-08-10
Minor fixes to Colvars module
2017-08-17 11:32:38 -06:00
dd67989c76 Merge pull request #601 from stanmoore1/kokkos_update
Update Kokkos library to v2.03.13
2017-08-17 11:32:18 -06:00
00aafef1a8 Merge pull request #597 from ndtrung81/three-body-short-nlist
Implementing short neighbor lists for three-body gpu styles
2017-08-17 11:31:59 -06:00
7175abcc71 flag more pair styles as supporting USER-INTEL to match newly added code 2017-08-16 13:58:46 -04:00
e34b20405c Fix a couple of typos in the docs 2017-08-15 21:10:05 -04:00
1d4d2155a2 USER-INTEL add-ons from Mike 2017-08-15 17:12:07 -06:00
cee87d7a54 update manual to point to packages.lammps.org instead of rpm.lammps.org 2017-08-15 16:19:09 -04:00
60e14f1490 add comment to msi2lmp README about symmetry limitations 2017-08-14 08:54:26 -04:00
81e7d4a942 fix incorrect preprocessor define for windows 2017-08-12 00:35:50 -04:00
0b3f1b8a15 patch 11Aug17 2017-08-11 12:19:02 -06:00
b209a4e246 Merge pull request #614 from akohlmey/fixes-for-stable
Fixes for stable
2017-08-11 08:35:20 -06:00
27553283c3 fix bug with pair_modify <style> compute not triggering the correct global change 2017-08-11 08:28:37 -04:00
df56b2d6a4 trigger relink after package library build through updating the time stamp on main.cpp 2017-08-11 08:16:20 -04:00
c6d923b6c8 make download function more resilient by also catching failed execution 2017-08-10 21:15:35 -04:00
6d24be8bb7 whitespace cleanup 2017-08-10 21:14:40 -04:00
8c16ea1bfc add automatic triggering of review requests with a code owners file 2017-08-10 15:58:51 -04:00
c8741f3a01 remove special_bonds extra keyword and refer to read_data and create_box instead 2017-08-10 15:12:56 -04:00
2a7d2dee36 add more strict checking of data when parsing molecule files to detect format errors 2017-08-10 14:49:51 -04:00
da01be7c18 More robust change from initial to target restraint centers in Colvars 2017-08-10 09:22:53 -04:00
3e9b41c6b7 Added references to GPU package citations 2017-08-09 10:09:40 -05:00
8a7a831bd6 Remove redundant check in Makefile.kokkos 2017-08-08 12:57:22 -06:00
8431ca5fec Remove tpls directory 2017-07-31 10:54:07 -06:00
13f2d39f55 Update Kokkos library to v2.03.13 2017-07-31 10:34:21 -06:00
aa60ef6ed8 Cleaned up 3-body kernels, reverted some mistaken changes to vashishta 2017-07-23 00:08:55 -05:00
a71f5a0c20 Enabled again neigh no with tpa > 1 for 3-body gpu styles for backward compatibility, could be slower than neigh no tpa 1 in many cases 2017-07-22 22:57:37 -05:00
3d1d0c58c7 Cleaned up 3-body gpu styles, and fixed a bug for tersoff/zbl/gpu.
There is an unresolved bug for neigh no with tpa > 1 with BaseThree; enforce tpa = 1 for neigh no in BaseThree for now.
2017-07-21 12:08:04 -05:00
cdac5f496c Built 3-body short neighbor list for the 3-body kernels using per-pair cutoffs for vashishta gpu style 2017-07-11 00:13:56 -05:00
8c9db3ea00 Built 2-body short neighbor list and used for 2-body kernels in tersoff gpu styles 2017-07-10 23:50:21 -05:00
ea2b01e83b Refactored 3-body gpu styles to remove code duplication 2017-07-08 20:17:31 -05:00
34fe2273f6 Added short neighbor list implementation for tersoff/zbl/gpu and tersoff/mod/gpu 2017-07-08 14:59:48 -05:00
77c60189b8 Minor cleanups for tersoff/gpu 2017-07-08 14:43:53 -05:00
1c6533e53d Working on short neighbor list for tersoff/gpu 2017-07-08 14:15:26 -05:00
68206079da Supported short neighbor lists for 3-body kernels in sw/gpu and vashishta/gpu 2017-07-07 16:47:24 -05:00
262 changed files with 25801 additions and 4212 deletions

21
.github/CODEOWNERS vendored Normal file
View File

@ -0,0 +1,21 @@
# This file contains file patterns that trigger automatic
# code review requests from users who are owners of these files
# Order matters: the last match has the highest precedence
# library folders
lib/colvars/* @giacomofiorin
lib/compress/* @akohlmey
lib/kokkos/* @stanmoore1
lib/molfile/* @akohlmey
lib/qmmm/* @akohlmey
lib/vtk/* @rbberger
# packages
src/KOKKOS @stanmoore1
src/USER-CGSDK @akohlmey
src/USER-COLVARS @giacomofiorin
src/USER-OMP @akohlmey
src/USER-QMMM @akohlmey
# tools
tools/msi2lmp/* @akohlmey
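# Hypothetical illustration (not part of the file above): since the last
# match has the highest precedence, a later, more specific pattern
# overrides an earlier, broader one. The @teamlead handle is invented:
#   src/*        @teamlead
#   src/KOKKOS   @stanmoore1   <- last match wins for files under src/KOKKOS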

Binary file not shown (image changed: 14 KiB before, 20 KiB after).

View File

@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="10 Aug 2017 version">
<META NAME="docnumber" CONTENT="17 Aug 2017 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
<H1></H1>
LAMMPS Documentation :c,h3
10 Aug 2017 version :c,h4
17 Aug 2017 version :c,h4
Version info: :h4
@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the
"LAMMPS project on GitHub."_https://github.com/lammps/lammps
The lammps.org domain, currently hosting "public continuous integration
testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux
RPM and Windows installer packages"_http://rpm.lammps.org is located
RPM and Windows installer packages"_http://packages.lammps.org is located
at Temple University and managed by Richard Berger,
richard.berger at temple.edu.

View File

@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"hybrid"_pair_hybrid.html,
"hybrid/overlay"_pair_hybrid.html,
"adp (o)"_pair_adp.html,
"airebo (o)"_pair_airebo.html,
"airebo/morse (o)"_pair_airebo.html,
"airebo (oi)"_pair_airebo.html,
"airebo/morse (oi)"_pair_airebo.html,
"beck (go)"_pair_beck.html,
"body"_pair_body.html,
"bop"_pair_bop.html,
@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"dpd/tstat (go)"_pair_dpd.html,
"dsmc"_pair_dsmc.html,
"eam (gkiot)"_pair_eam.html,
"eam/alloy (gkot)"_pair_eam.html,
"eam/fs (gkot)"_pair_eam.html,
"eam/alloy (gkiot)"_pair_eam.html,
"eam/fs (gkiot)"_pair_eam.html,
"eim (o)"_pair_eim.html,
"gauss (go)"_pair_gauss.html,
"gayberne (gio)"_pair_gayberne.html,
@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
"kim"_pair_kim.html,
"lcbop"_pair_lcbop.html,
"line/lj"_pair_line_lj.html,
"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
"lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
"lj/charmm/coul/long (giko)"_pair_charmm.html,
"lj/charmm/coul/long (gkio)"_pair_charmm.html,
"lj/charmm/coul/msm"_pair_charmm.html,
"lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
"lj/charmmfsw/coul/long"_pair_charmm.html,
@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
"polymorphic"_pair_polymorphic.html,
"python"_pair_python.html,
"reax"_pair_reax.html,
"rebo (o)"_pair_airebo.html,
"rebo (oi)"_pair_airebo.html,
"resquared (go)"_pair_resquared.html,
"snap"_pair_snap.html,
"soft (go)"_pair_soft.html,

View File

@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd
{New bond exceeded special list size in fix bond/create} :dt
See the "special_bonds extra" command
(or the "read_data extra/special/per/atom" command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd
@ -9666,8 +9666,8 @@ you are running. :dd
{Special list size exceeded in fix bond/create} :dt
See the special_bonds extra command
(or the read_data extra/special/per/atom command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd
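As a minimal input sketch of the recommended route (the data file name and
all numeric values are illustrative only, not from the original text):

read_data polymer.data extra/special/per/atom 6
fix 1 all bond/create 10 1 2 1.3 5 :pre

Here {extra/special/per/atom 6} reserves room for up to six additional
special neighbors per atom before fix bond/create starts adding bonds.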

View File

@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS
and Windows system libraries and Unix-like environments such as Linux
or macOS, a few adjustments may be needed when compiling for Windows:
Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable)
Try adding -static-libgcc or -static or both to the linker flags when your
LAMMPS executable complains about missing .dll files :ul
Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul
Since none of the current LAMMPS core developers
has significant experience building executables on Windows, we are
happy to distribute contributed instructions and modifications, but
we cannot provide support for those.
Since none of the current LAMMPS core developers has significant
experience building executables on Windows, we are happy to distribute
contributed instructions and modifications to improve the situation,
but we cannot provide support for those.
With the so-called "Anniversary Update" to Windows 10, there is an
Ubuntu Linux subsystem available for Windows that can be installed
and then used to compile/install LAMMPS as if you were running on an
Ubuntu Linux system instead of Windows.
As an alternative, you can download "daily builds" (and some older
versions) of the installer packages from
"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html.
These executables are built with most optional packages and the
download includes documentation, potential files, some tools and
many examples, but no source code.
As an alternative, you can download pre-compiled installer packages from
"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html.
These executables are built with most optional packages included and the
download includes documentation, potential files, some tools and many
examples, but no source code.
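Put together, the three adjustments above might look like the following
machine-makefile fragment. This is a sketch only, assuming the usual
variable names from src/MAKE; the LMP_INC contents are placeholders:

LMP_INC = -DLAMMPS_GZIP
LIB = -lwsock32 -lpsapi
LINKFLAGS = -O -static-libgcc -static :pre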
:line
@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages.
:line
On a Windows box, you can skip making LAMMPS and simply download an
installer package from "here"_http://rpm.lammps.org/windows.html
installer package from "here"_http://packages.lammps.org/windows.html
For running the non-MPI executable, follow these steps:
@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l
At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj]
with the name of your LAMMPS input script. :l
The serial executable includes support for multi-threading
parallelization from the styles in the USER-OMP package.
To run with, e.g. 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp"
:ule
For the MPI version, which allows you to run LAMMPS under Windows on
multiple processors, follow these steps:
For the MPI version, which allows you to run LAMMPS under Windows with
the more general message passing parallel library (LAMMPS has been
designed from ground up to use MPI efficiently), follow these steps:
Download and install
"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads
for Windows. :ulb,l
Download and install a compatible MPI library binary package:
for 32-bit Windows
"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi
and for 64-bit Windows
"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi
:ulb,l
The LAMMPS Windows installer packages will automatically adjust your
path for the default location of this MPI package. After the installation
of the MPICH software, it needs to be integrated into the system.
of the MPICH2 software, it needs to be integrated into the system.
For this you need to start a Command Prompt in {Administrator Mode}
(right click on the icon and select it). Change into the MPICH2
installation directory, then into the subdirectory [bin] and execute
@ -1137,7 +1144,7 @@ or
mpiexec -np 4 lmp_mpi -in in.lj :pre
replacing in.lj with the name of your LAMMPS input script. For the latter
replacing [in.lj] with the name of your LAMMPS input script. For the latter
case, you may be prompted to enter your password. :l
In this mode, output may not immediately show up on the screen, so if
@ -1149,6 +1156,11 @@ something like:
lmp_mpi -in in.lj :pre
And the parallel executable also includes OpenMP multi-threading, which
can be combined with MPI using something like:
mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre
:ule
:line

View File

@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
:ule

View File

@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. The "extra special per atom"
parameter must typically be set to allow for the new maximum total
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2
ways to do this. See the "read_data"_read_data.html or
"create_box"_create_box.html or "special_bonds extra" commands for
details.
"create_box"_create_box.html commands for details.
NOTE: Even if you do not use the {atype}, {dtype}, or {itype}
keywords, the list of topological neighbors is updated for atoms

View File

@ -7,10 +7,13 @@
:line
pair_style airebo command :h3
pair_style airebo/intel command :h3
pair_style airebo/omp command :h3
pair_style airebo/morse command :h3
pair_style airebo/morse/intel command :h3
pair_style airebo/morse/omp command :h3
pair_style rebo command :h3
pair_style rebo/intel command :h3
pair_style rebo/omp command :h3
[Syntax:]

View File

@ -7,6 +7,7 @@
:line
pair_style lj/charmm/coul/charmm command :h3
pair_style lj/charmm/coul/charmm/intel command :h3
pair_style lj/charmm/coul/charmm/omp command :h3
pair_style lj/charmm/coul/charmm/implicit command :h3
pair_style lj/charmm/coul/charmm/implicit/omp command :h3

View File

@ -14,6 +14,7 @@ pair_style eam/omp command :h3
pair_style eam/opt command :h3
pair_style eam/alloy command :h3
pair_style eam/alloy/gpu command :h3
pair_style eam/alloy/intel command :h3
pair_style eam/alloy/kk command :h3
pair_style eam/alloy/omp command :h3
pair_style eam/alloy/opt command :h3
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
pair_style eam/cd/omp command :h3
pair_style eam/fs command :h3
pair_style eam/fs/gpu command :h3
pair_style eam/fs/intel command :h3
pair_style eam/fs/kk command :h3
pair_style eam/fs/omp command :h3
pair_style eam/fs/opt command :h3

View File

@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c
{coul} values = w1,w2,w3
w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions
{angle} value = {yes} or {no}
{dihedral} value = {yes} or {no}
{extra} value = N
N = number of extra 1-2,1-3,1-4 interactions to save space for :pre
{dihedral} value = {yes} or {no} :pre
:ule
Examples:
@ -36,8 +34,7 @@ special_bonds amber
special_bonds charmm
special_bonds fene dihedral no
special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes
special_bonds lj/coul 0 1 1 extra 2 :pre
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre
[Description:]
@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting
of 1.0). If the {dihedral} keyword is specified as {no} which is the
default, then the 2,5 interaction will also be weighted by 0.5.
The {extra} keyword can be used when additional bonds will be created
during a simulation run, e.g. by the "fix
bond/create"_fix_bond_create.html command. It can also be used if
molecules will be added to the system, e.g. via the "fix
deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which
will have atoms with more special neighbors than any atom in the
current system has.
:line
NOTE: LAMMPS stores and maintains a data structure with a list of the
@ -194,8 +183,9 @@ the system). If new bonds are created (or molecules added containing
atoms with more special neighbors), the size of this list needs to
grow. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. Using the {extra} keyword leaves
empty space in the list for this N additional 1st, 2nd, or 3rd
molecular topology of your system. Using the {extra/special/per/atom}
keyword to either "read_data"_read_data.html or "create_box"_create_box.html
reserves empty space in the list for this N additional 1st, 2nd, or 3rd
neighbors to be added. If you do not do this, you may get an error
when bonds (or molecules) are added.
@ -203,8 +193,7 @@ when bonds (or molecules) are added.
NOTE: If you reuse this command in an input script, you should set all
the options you need each time. This command cannot be used a 2nd
time incrementally, e.g. to add some extra storage locations via the
{extra} keyword. E.g. these two commands:
time incrementally. E.g. these two commands:
special_bonds lj 0.0 1.0 1.0
special_bonds coul 0.0 0.0 1.0
@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0
because the LJ settings are reset to their default values
each time the command is issued.
Likewise
special_bonds amber
special_bonds extra 2 :pre
is not the same as this single command:
special_bonds amber extra 2 :pre
since in the former case, the 2nd command will reset all the LJ and
Coulombic weights to 0.0 (the default).
One exception to this rule is the {extra} option itself. It is not
reset to its default value of 0 each time the special_bonds command is
invoked. This is because it can also be set by the
"read_data"_read_data.html and "create_box"_create_box.html commands,
so this command will not override those settings unless you explicitly
use {extra} as an option.
[Restrictions:] none
[Related commands:]

0
doc/src/tutorial_bash_on_windows.txt Executable file → Normal file
View File

View File

@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching
DC-DP pairs and will treat DP as equivalent to their DC in the
{special bonds} relations. It may be necessary to extend the space
for storing such special relations. In this case extra space should
be reserved by using the {extra} keyword of the {special_bonds}
be reserved by using the {extra/special/per/atom} keyword of either
the "read_data"_read_data.html or "create_box"_create_box.html
command. With our phenol, there is 1 more special neighbor for which
space is required. Otherwise LAMMPS crashes and gives the required
value.
special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre
read_data data-p.lmp extra/special/per/atom 1 :pre
Let us assume we want to run a simple NVT simulation at 300 K. Note
that Drude oscillators need to be thermalized at a low temperature in

0
doc/src/tutorials.txt Executable file → Normal file
View File

View File

@ -45,12 +45,12 @@ while iarg < nargs:
if args[iarg] == "-m":
if iarg+2 > len(args): error()
machine = args[iarg+1]
iarg += 2
iarg += 2
elif args[iarg] == "-e":
if iarg+2 > len(args): error()
extraflag = True
suffix = args[iarg+1]
iarg += 2
iarg += 2
else: error()
# set lib from working dir

View File

@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start from.
**Optional**: if you use the Install.py script provided in this folder, you
can give the machine name as the '-m' argument. This can be the suffix of one
of the files from either this folder, or from src/MAKE.
of the files from either this folder, or from src/MAKE/MACHINES.
*This is only supported by the Install.py within the lib/colvars folder*.
When you are done building this library, two files should
@ -53,10 +53,10 @@ settings in Makefile.common should work.
For the reference manual see:
http://colvars.github.io/colvars-refman-lammps
A copy of reference manual is also in:
A copy of the reference manual is also in:
doc/PDF/colvars-refman-lammps.pdf
Also included is a Doxygen-based developer documentation:
Also available is Doxygen-based developer documentation:
http://colvars.github.io/doxygen/html/
The reference article is:

View File

@ -88,7 +88,12 @@ public:
static std::vector<feature *> cv_features;
/// \brief Implementation of the feature list accessor for colvar
std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cv_features;
}
virtual std::vector<feature *> &modify_features()
{
return cv_features;
}

View File

@ -206,7 +206,12 @@ public:
static std::vector<feature *> ag_features;
/// \brief Implementation of the feature list accessor for atom group
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return ag_features;
}
virtual std::vector<feature *> &modify_features()
{
return ag_features;
}

View File

@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os)
os << " ";
if (b_output_energy)
os << " "
<< std::setprecision(cvm::en_prec) << std::setw(cvm::en_width)
<< bias_energy;
return os;
}

View File

@ -175,7 +175,11 @@ public:
static std::vector<feature *> cvb_features;
/// \brief Implementation of the feature list accessor for colvarbias
virtual std::vector<feature *> &features()
virtual const std::vector<feature *> &features()
{
return cvb_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvb_features;
}

View File

@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (null_centers) {
// try to initialize the restraint centers for the first time
colvar_centers.resize(num_variables());
colvar_centers_raw.resize(num_variables());
for (i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].reset();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i].reset();
}
}
@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (cvm::debug()) {
cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n");
}
colvar_centers_raw[i] = colvar_centers[i];
colvar_centers[i].apply_constraints();
}
null_centers = false;
@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf)
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].apply_constraints();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i] = colvar_centers[i];
}
}
return COLVARS_OK;
@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf)
{
if (b_chg_centers || b_chg_force_k) {
if (target_nstages) {
// cvm::log ("Reading current stage from the restart.\n");
if (!get_keyval(conf, "stage", stage))
cvm::error("Error: current stage is missing from the restart.\n");
}
@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf)
size_t i;
if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) {
if (colvar_centers.size() != num_variables()) {
if (target_centers.size() != num_variables()) {
cvm::error("Error: number of target centers does not match "
"that of collective variables.\n");
"that of collective variables.\n", INPUT_ERROR);
}
b_chg_centers = true;
for (i = 0; i < target_centers.size(); i++) {
target_centers[i].apply_constraints();
centers_incr.push_back(colvar_centers[i]);
centers_incr[i].reset();
}
}
if (b_chg_centers) {
// parse moving restraint options
// parse moving schedule options
colvarbias_restraint_moving::init(conf);
if (initial_centers.size() == 0) {
// One-time init
initial_centers = colvar_centers;
}
// Call to check that the definition is correct
for (i = 0; i < num_variables(); i++) {
colvarvalue const midpoint =
colvarvalue::interpolate(initial_centers[i],
target_centers[i],
0.5);
}
} else {
target_centers.clear();
return COLVARS_OK;
}
get_keyval(conf, "outputCenters", b_output_centers, b_output_centers);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work,
b_output_acc_work); // TODO this conflicts with stages
return COLVARS_OK;
}
int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda)
{
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
size_t i;
for (i = 0; i < num_variables(); i++) {
colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i],
target_centers[i],
lambda);
centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]);
colvar_centers[i] = c_new;
variables(i)->wrap(colvar_centers[i]);
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update()
{
if (b_chg_centers) {
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
if (!centers_incr.size()) {
// if this is the first calculation, calculate the advancement
// at each simulation step (or stage, if applicable)
// (take current stage into account: it can be non-zero
// if we are restarting a staged calculation)
centers_incr.resize(num_variables());
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].type(variables(i)->value());
centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) /
cvm::real( target_nstages ? (target_nstages - stage) :
(target_nsteps - cvm::step_absolute()));
}
if (cvm::debug()) {
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n");
}
}
if (target_nstages) {
if ((cvm::step_relative() > 0)
&& (cvm::step_absolute() % target_nsteps) == 0
&& stage < target_nstages) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// Staged update
if (stage <= target_nstages) {
if ((cvm::step_relative() > 0) &&
((cvm::step_absolute() % target_nsteps) == 1)) {
cvm::real const lambda =
cvm::real(stage)/cvm::real(target_nstages);
update_centers(lambda);
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
}
} else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) {
// move the restraint centers in the direction of the targets
// (slow growth)
} else {
// Continuous update
if (cvm::step_absolute() <= target_nsteps) {
cvm::real const lambda =
cvm::real(cvm::step_absolute())/cvm::real(target_nsteps);
update_centers(lambda);
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
}
if (cvm::step_relative() == 0) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// finite differences are undefined when restarting
centers_incr[i].reset();
}
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+
" at stage "+cvm::to_str(stage)+ ".\n");
}
}
return COLVARS_OK;
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update_acc_work()
{
if (b_output_acc_work) {
if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) {
if ((cvm::step_relative() > 0) &&
(cvm::step_absolute() <= target_nsteps)) {
for (size_t i = 0; i < num_variables(); i++) {
// project forces on the calculated increments at this step
acc_work += colvar_forces[i] * centers_incr[i];
@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
<< colvar_centers[i];
}
os << "\n";
os << "centers_raw ";
for (i = 0; i < num_variables(); i++) {
os << " "
<< std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width)
<< colvar_centers_raw[i];
}
os << "\n";
if (b_output_acc_work) {
os << "accumulatedWork "
@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
}
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con
// cvm::log ("Reading the updated restraint centers from the restart.\n");
if (!get_keyval(conf, "centers", colvar_centers))
cvm::error("Error: restraint centers are missing from the restart.\n");
if (!get_keyval(conf, "centers_raw", colvar_centers_raw))
cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n");
if (b_output_acc_work) {
if (!get_keyval(conf, "accumulatedWork", acc_work))
cvm::error("Error: accumulatedWork is missing from the restart.\n");
@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const
<< std::setprecision(cvm::en_prec)
<< std::setw(cvm::en_width) << force_k << "\n";
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons
std::string const colvarbias_restraint_harmonic::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i
std::string const colvarbias_restraint_harmonic_walls::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
}
@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const
std::string const colvarbias_restraint_linear::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
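// Illustrative sketch (not Colvars code): the net effect of this refactor
// is that restraint centers are recomputed by interpolating between the
// stored initial and target values at a progress parameter lambda, rather
// than by accumulating per-step increments (which could drift across
// restarts). Scalar values below are invented.
#include <cstdio>

// linear path from the initial to the target center
static double interpolate(double x0, double x1, double lambda) {
  return (1.0 - lambda) * x0 + lambda * x1;
}

int main() {
  const double initial = 2.0, target = 6.0;  // placeholder centers
  const long nsteps = 4;                     // placeholder schedule length
  for (long step = 0; step <= nsteps; ++step) {
    // continuous update: lambda = step/nsteps;
    // a staged run would use lambda = stage/nstages instead
    double lambda = (double)step / (double)nsteps;
    std::printf("step %ld: center = %.2f\n", step,
                interpolate(initial, target, lambda));
  }
  return 0;
}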

View File

@ -74,9 +74,6 @@ protected:
/// \brief Restraint centers
std::vector<colvarvalue> colvar_centers;
/// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied)
std::vector<colvarvalue> colvar_centers_raw;
};
@ -156,10 +153,16 @@ protected:
/// \brief New restraint centers
std::vector<colvarvalue> target_centers;
/// \brief Initial value of the restraint centers
std::vector<colvarvalue> initial_centers;
/// \brief Amplitude of the restraint centers' increment at each step
/// (or stage) towards the new values (calculated from target_nsteps)
/// towards the new values (calculated from target_nsteps)
std::vector<colvarvalue> centers_incr;
/// \brief Update the centers by interpolating between initial and target
virtual int update_centers(cvm::real lambda);
/// Whether to write the current restraint centers to the trajectory file
bool b_output_centers;

View File

@ -132,9 +132,15 @@ public:
static std::vector<feature *> cvc_features;
/// \brief Implementation of the feature list accessor for colvar
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cvc_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvc_features;
}
/// \brief Obtain data needed for the calculation for the backend
virtual void read_data();

View File

@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) {
}
void colvardeps::init_feature(int feature_id, const char *description, feature_type type) {
features()[feature_id]->description = description;
features()[feature_id]->type = type;
modify_features()[feature_id]->description = description;
modify_features()[feature_id]->type = type;
}
// Shorthand macros for describing dependencies
@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() {
int i;
if (features().size() == 0) {
for (i = 0; i < f_cvb_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvb_active, "active", f_type_dynamic);
@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() {
size_t i;
if (features().size() == 0) {
for (i = 0; i < f_cv_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cv_active, "active", f_type_dynamic);
@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < colvardeps::f_cvc_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvc_active, "active", f_type_dynamic);
@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < f_ag_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_ag_active, "active", f_type_dynamic);

View File

@ -135,7 +135,8 @@ public:
// with a non-static array
// Intermediate classes (colvarbias and colvarcomp, which are also base classes)
// implement this as virtual to allow overriding
virtual std::vector<feature *>&features() = 0;
virtual const std::vector<feature *>&features() = 0;
virtual std::vector<feature *>&modify_features() = 0;
void add_child(colvardeps *child);
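// Illustrative sketch (not Colvars code) of the pattern adopted here:
// features() stays a read-only accessor, and all mutation is routed
// through modify_features(). All names below are invented.
#include <vector>

struct feature { const char *description; };

class deps_base {
public:
  virtual ~deps_base() {}
  // read-only access is the default...
  virtual const std::vector<feature *> &features() = 0;
  // ...and mutation must be requested explicitly
  virtual std::vector<feature *> &modify_features() = 0;
};

class my_object : public deps_base {
  static std::vector<feature *> my_features;  // one list shared by the class
public:
  const std::vector<feature *> &features() { return my_features; }
  std::vector<feature *> &modify_features() { return my_features; }
};

std::vector<feature *> my_object::my_features;

int main() {
  my_object obj;
  obj.modify_features().push_back(new feature());  // mutation is visible here
  return obj.features().empty() ? 1 : 0;           // queries stay read-only
}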

View File

@ -1,4 +1,5 @@
#define COLVARS_VERSION "2017-07-15"
#ifndef COLVARS_VERSION
#define COLVARS_VERSION "2017-08-06"
// This file is part of the Collective Variables module (Colvars).
// The original version of Colvars and its updates are located at:
// https://github.com/colvars/colvars
@ -6,3 +7,4 @@
// If you wish to distribute your changes, please submit them to the
// Colvars repository at GitHub.
#endif

View File

@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj,
}
if ((subcmd == "get") || (subcmd == "set")) {
std::vector<colvardeps::feature *> &features = obj->features();
std::vector<colvardeps::feature *> const &features = obj->features();
std::string const req_feature(obj_to_str(objv[3]));
colvardeps::feature *f = NULL;
int fid = 0;

View File

@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false;
cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02;
/// Numerical recipes diagonalization
static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
static int eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
static int transpose(cvm::real **v);
std::string cvm::rvector::to_simple_string() const
{
std::ostringstream os;
@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d<cvm::real> &S,
// diagonalize
int jac_nrot = 0;
jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot);
if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) !=
COLVARS_OK) {
cvm::error("Too many iterations in routine jacobi.\n"
"This is usually the result of an ill-defined set of atoms for "
"rotational alignment (RMSD, rotateReference, etc).\n");
}
eigsrt(S_eigval.c_array(), S_eigvec.c_array());
// jacobi saves eigenvectors by columns
transpose(S_eigvec.c_array());
@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector<cvm::atom_pos> co
#define n 4
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
{
int j,iq,ip,i;
cvm::real tresh,theta,tau,t,sm,s,h,g,c;
@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
sm += std::fabs(a[ip][iq]);
}
if (sm == 0.0) {
return;
return COLVARS_OK;
}
if (i < 4)
tresh=0.2*sm/(n*n);
@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
z[ip]=0.0;
}
}
cvm::error("Too many iterations in routine jacobi.\n");
return COLVARS_ERROR;
}
void eigsrt(cvm::real *d, cvm::real **v)
int eigsrt(cvm::real *d, cvm::real **v)
{
int k,j,i;
cvm::real p;
@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v)
}
}
}
return COLVARS_OK;
}
void transpose(cvm::real **v)
int transpose(cvm::real **v)
{
cvm::real p;
int i,j;
@ -641,6 +660,7 @@ void transpose(cvm::real **v)
v[j][i]=p;
}
}
return COLVARS_OK;
}
#undef n
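// Illustrative sketch (not Colvars code) of the status-return pattern this
// change introduces: the numerical helper reports a status code instead of
// raising the error itself, so the caller can attach domain-specific
// context (here, the hint about ill-defined atom sets). Names and
// constants below are invented.
#include <cstdio>

static const int MY_OK = 0, MY_ERROR = 1;

static int iterate_until_converged(int max_iter) {
  double residual = 1.0;
  for (int i = 0; i < max_iter; ++i) {
    residual *= 0.9;                  // pretend to make slow progress
    if (residual < 1.0e-12) return MY_OK;
  }
  return MY_ERROR;                    // too many iterations
}

int main() {
  if (iterate_until_converged(50) != MY_OK) {
    // the caller owns the user-facing diagnostic and adds context
    std::fprintf(stderr, "Too many iterations; check the atom selection.\n");
    return 1;
  }
  return 0;
}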

View File

@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m,
}
/// Numerical recipes diagonalization
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
void eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
void transpose(cvm::real **v);
/// \brief 1-dimensional vector of real numbers with four components and

View File

@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const
}
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
colvarvalue const colvarvalue::interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda)
{
colvarvalue::check_types(x1, x2);
if ((lambda < 0.0) || (lambda > 1.0)) {
cvm::error("Error: trying to interpolate between two colvarvalues with a "
"lamdba outside [0:1].\n", BUG_ERROR);
}
colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2);
cvm::real const d2 = x1.dist2(x2);
switch (x1.type()) {
case colvarvalue::type_scalar:
case colvarvalue::type_3vector:
case colvarvalue::type_vector:
case colvarvalue::type_unit3vectorderiv:
case colvarvalue::type_quaternionderiv:
return interp;
break;
case colvarvalue::type_unit3vector:
case colvarvalue::type_quaternion:
if (interp.norm()/std::sqrt(d2) < 1.0e-6) {
cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+
cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+
" is undefined: result = "+cvm::to_str(interp)+"\n",
INPUT_ERROR);
}
interp.apply_constraints();
return interp;
break;
case colvarvalue::type_notset:
default:
x1.undef_op();
break;
}
return colvarvalue(colvarvalue::type_notset);
}
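// Why the norm check above: for unit vectors or quaternions that are
// (nearly) antipodal, the linear blend collapses toward zero and leaves
// no direction to renormalize. A tiny stand-alone demonstration
// (not Colvars code):
#include <cmath>
#include <cstdio>

int main() {
  double x1[3] = {1, 0, 0}, x2[3] = {-1, 0, 0};  // antipodal unit vectors
  double lambda = 0.5, interp[3], norm2 = 0.0;
  for (int k = 0; k < 3; k++) {
    interp[k] = (1.0 - lambda) * x1[k] + lambda * x2[k];
    norm2 += interp[k] * interp[k];
  }
  // prints 0: the midpoint has no direction, so it cannot be rescaled
  // back onto the unit sphere, hence the INPUT_ERROR above
  std::printf("|interp| = %g\n", std::sqrt(norm2));
  return 0;
}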
std::string colvarvalue::to_simple_string() const
{
switch (type()) {

View File

@ -193,6 +193,12 @@ public:
/// Derivative with respect to this \link colvarvalue \endlink of the square distance
colvarvalue dist2_grad(colvarvalue const &x2) const;
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
static colvarvalue const interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda = 0.5);
/// Assignment operator (type of x is checked)
colvarvalue & operator = (colvarvalue const &x);
@ -285,10 +291,10 @@ public:
cvm::real & operator [] (int const i);
/// Ensure that the two types are the same within a binary operator
int static check_types(colvarvalue const &x1, colvarvalue const &x2);
static int check_types(colvarvalue const &x1, colvarvalue const &x2);
/// Ensure that the two types are the same within an assignment, or that the left side is type_notset
int static check_types_assign(Type const &vt1, Type const &vt2);
static int check_types_assign(Type const &vt1, Type const &vt2);
/// Undefined operation
void undef_op() const;
@ -317,14 +323,14 @@ public:
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
@ -332,14 +338,14 @@ public:
/// \brief Optimized routine for the second order Legendre
/// polynomial, (3cos^2(w)-1)/2, of one collective variable with an
/// array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the second order Legendre
/// polynomial of one collective variable with an array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);

View File

@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -
specify one or more options, order does not matter
copies an existing Makefile.machine in lib/gpu to Makefile.auto
copies an existing Makefile.machine in lib/gpu to Makefile.auto
optionally edits these variables in Makefile.auto:
CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
optionally uses Makefile.auto to build the GPU library -> libgpu.a
@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix
-h = set CUDA_HOME variable in Makefile.auto to hdir
hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
-a = set CUDA_ARCH variable in Makefile.auto to arch
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
or GeForce GTX 580 or similar
use arch = 30 for Tesla K10 (Kepler)
use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
@ -108,10 +108,10 @@ if pflag:
elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE"
elif precision == "single": precstr = "-D_SINGLE_SINGLE"
else: error("Invalid precision setting")
# create Makefile.auto
# reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested
if not os.path.exists("Makefile.%s" % isuffix):
error("lib/gpu/Makefile.%s does not exist" % isuffix)

View File

@ -22,21 +22,21 @@
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
if (nbor_mem==packed_mem) { \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
i, numj, n_stride, nbor_end, nbor_begin) \
i=dev_nbor[ii]; \
nbor_begin=ii+nbor_pitch; \
numj=dev_nbor[nbor_begin]; \
if (dev_nbor==dev_packed) { \
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
n_stride=fast_mul(t_per_atom,nbor_pitch); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \
} else { \
nbor_begin+=nbor_stride; \
nbor_begin=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_pitch; \
nbor_begin=dev_nbor[nbor_begin]; \
nbor_end=nbor_begin+numj; \
stride=t_per_atom; \
n_stride=t_per_atom; \
nbor_begin+=offset; \
}
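// Illustrative CPU-side model (not library code) of the traversal this
// macro sets up: with t_per_atom threads cooperating on one atom, each
// thread starts at its own offset and strides by t_per_atom (on the
// device, by the packed row stride n_stride). Sizes below are invented.
#include <cstdio>

int main() {
  const int numj = 10;       // neighbor count for one atom
  const int t_per_atom = 4;  // cooperating threads per atom
  for (int offset = 0; offset < t_per_atom; ++offset) {
    std::printf("thread %d visits:", offset);
    for (int n = offset; n < numj; n += t_per_atom)  // stride = t_per_atom
      std::printf(" j[%d]", n);
    std::printf("\n");
  }
  return 0;
}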

View File

@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center,
const char *k_three_end) {
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
screen=_screen;
int gpu_nbor=0;
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_max_an_bytes+=ans2->gpu_bytes();
#endif
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
return 0;
}
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
k_three_end.clear();
k_three_end_vatom.clear();
k_pair.clear();
k_short_nbor.clear();
delete pair_program;
_compiled=false;
}
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
_nall = nall;
// originally the requirement that nall == nlist was enforced
// to allow direct indexing of neighbors of neighbors after re-arrangement
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
return 0;
atom->cast_copy_x(host_x,host_type);
_nall = nall;
int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nall;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center,
const char *kthree_end) {
const char *two, const char *three_center,
const char *three_end, const char* short_nbor) {
if (_compiled)
return;
std::string vatom_name=std::string(kthree_end)+"_vatom";
std::string vatom_name=std::string(three_end)+"_vatom";
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center);
k_three_end.set_function(*pair_program,kthree_end);
k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo);
k_pair.set_function(*pair_program,two);
k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT

View File

@ -56,7 +56,8 @@ class BaseThree {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=NULL);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -73,18 +74,18 @@ class BaseThree {
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
/** \param inum number of particles whose nbors must be stored on device
* \param max_nbors maximum number of neighbors
* \param success set to false if insufficient memory
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
/** \param inum number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
@ -143,14 +144,6 @@ class BaseThree {
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
@ -193,6 +186,9 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@ -207,12 +203,13 @@ class BaseThree {
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;

View File

@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end");
"k_sw_three_end","k_sw_short_nbor");
if (success!=0)
return success;
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -130,6 +130,63 @@ texture<int4> sw3_tex;
#endif
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
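
The layout k_sw_short_nbor writes is the same strided one the full list uses: the slot saved in m holds the filtered neighbor count for this (atom, lane) pair, and the surviving, still bit-packed neighbor entries follow at n_stride intervals. A minimal device-side sketch of reading that back, assuming NEIGHMASK is in scope as in the kernels above (the guarded define below is the LAMMPS default):

// Sketch only: walking one atom's short list as written above.
#ifndef NEIGHMASK
#define NEIGHMASK 0x3FFFFFFF   // LAMMPS default special-bond mask
#endif
__device__ void walk_short_nbor(const int *dev_short_nbor, int head,
                                int n_stride) {
  int count = dev_short_nbor[head];            // count lives in the head slot
  int slot = head + n_stride;                  // entries start one stride later
  for (int jj = 0; jj < count; jj++, slot += n_stride) {
    int j = dev_short_nbor[slot] & NEIGHMASK;  // strip special-bond bits
    (void)j;                                   // ... use neighbor j ...
  }
}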
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem = dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
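
In k_sw_three_center above, the packed-list path visits each unordered (j,k) pair once by starting the k loop just past j's slot, while the short-list path restarts from nborj_start and instead skips k <= j by atom index. A small sketch of why the two skips enumerate the same pairs, with a hypothetical flat nbrs array standing in for the strided list:

// Sketch only: two equivalent ways to visit each unordered pair once,
// assuming the neighbor indices in nbrs are distinct.
void pairs_by_position(const int *nbrs, int n) {
  for (int jj = 0; jj < n; jj++)
    for (int kk = jj + 1; kk < n; kk++) {
      // handle pair (nbrs[jj], nbrs[kk])  -- packed-list style
    }
}
void pairs_by_index(const int *nbrs, int n) {
  for (int jj = 0; jj < n; jj++)
    for (int kk = 0; kk < n; kk++) {
      if (nbrs[kk] <= nbrs[jj]) continue;  // short-list style: skip k <= j
      // handle pair (nbrs[jj], nbrs[kk])
    }
}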

View File

@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff,"k_tersoff_repulsive",
"k_tersoff_three_center", "k_tersoff_three_end");
"k_tersoff_three_center", "k_tersoff_three_end",
"k_tersoff_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
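
With the compute() wrappers gone (they now live in the base class), zetaij is grown inside loop(): capacity is checked against nall*_max_nbors and, when it must grow, the atom count is padded by 10% to damp repeated reallocation. A minimal sketch of that policy, with a hypothetical zetaij_capacity helper:

// Sketch only: growth policy for the per-pair zeta store.
#include <cstddef>

// Returns 0 if the current capacity suffices, else the new capacity.
size_t zetaij_capacity(size_t cols_now, int nall, int max_nbors) {
  size_t needed = (size_t)nall * (size_t)max_nbors;
  if (needed <= cols_now) return 0;
  int nmax = (int)((double)nall * 1.10);     // 10% headroom on the atom count
  return (size_t)max_nbors * (size_t)nmax;   // mirrors _zetaij.resize(...)
}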

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
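
The rename from store_zeta to acc_zeta reflects what the macro actually does: it reduces each sub-group's partial zeta across its t_per_atom lanes with a shuffle butterfly, halving the stride each step. A self-contained CUDA sketch of the same reduction, assuming the Geryon shfl_xor wrapper maps to __shfl_xor_sync:

// Sketch only: butterfly sum over a power-of-two sub-warp of t_per_atom lanes.
__device__ float acc_zeta_sketch(float z, int t_per_atom) {
  for (int s = t_per_atom / 2; s > 0; s >>= 1)
    z += __shfl_xor_sync(0xffffffffu, z, s, t_per_atom);
  return z;   // every lane in the sub-group now holds the full sum
}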
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff currently supports at most 3 elements, i.e. 3*3*3 = 27 table entries,
// while the block size is never less than 32.
// SHARED_SIZE = 32 for now to reduce the shared-memory pressure per block
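
Every lookup in these kernels flattens the nelements^3 parameter table row-major, and the two-body lookups in the short-nbor kernels deliberately take the (i,j,j) entry. A tiny sketch of the indexing, with hypothetical helper names:

// Sketch only: row-major flattening used by the elem2param lookups above.
__host__ __device__ inline int param_idx(int i, int j, int k, int n) {
  return i * n * n + j * n + k;   // elem2param[i][j][k]
}
__host__ __device__ inline int ij_param(int i, int j, int n) {
  return param_idx(i, j, j, n);   // (i,j,j): two-body entry for the ij pair
}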
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
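
The re-indentation above is the visible half of dropping the if (rsq<cutsq[ijparam]) guard; the short-list path has already filtered those pairs, and results should be unchanged in the packed path too, since the standard Tersoff switching function vanishes beyond bigr+bigd. A hedged sketch of that switching function (standard textbook form, not copied from this file):

// Sketch only: standard Tersoff cutoff; zero for r > bigr + bigd, which is
// why the removed distance guard was redundant for energy and forces.
#include <cmath>
__host__ __device__ inline float ters_fc_sketch(float r, float bigr, float bigd) {
  const float half_pi = 1.57079632679f;
  if (r < bigr - bigd) return 1.0f;
  if (r > bigr + bigd) return 0.0f;
  return 0.5f * (1.0f - sinf(half_pi * (r - bigr) / bigd));
}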
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
}
} // for nbor
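
Both the writer (k_tersoff_zeta) and the readers (the three-body kernels above) now derive the zetaij slot directly from the short-list position: the index is nbor_j / ijnum / nbor_k stepped back by one n_stride over the count slot, which replaces the old zeta_idx arithmetic. A small sketch of the convention, reusing the strided layout of the short-list builder:

// Sketch only: short-list position -> zetaij slot, as used above.
// Neighbor positions start one n_stride past the count slot at the head,
// so shifting every position back by n_stride yields ids that begin at
// the head slot and stay unique per (i,j) pair; without the short list,
// the raw position is used unchanged.
__host__ __device__ inline int zeta_slot(int nbor_pos, int n_stride,
                                         bool short_list) {
  return short_list ? nbor_pos - n_stride : nbor_pos;
}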

View File

@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
const double* h, const double* gamma, const double* beta,
const double* powern, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
"k_tersoff_mod_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffMT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
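
Each launch in these loop() bodies sizes its grid as ceil(N / (BX / lanes)), where the lane count is t_per_atom for the short-nbor and force kernels but JTHREADS*KTHREADS (tpa squared, j and k lanes both split) for the zeta kernel, and N is this->_ainum for the short-nbor and zeta passes but ans->inum() for the force kernels, since the zeta table can cover more atoms than the local force loop. A minimal integer sketch of the computation, with a hypothetical grid_x name:

// Sketch only: grid sizing as used before each set_size/run pair above.
inline int grid_x(int n, int block_x, int lanes_per_atom) {
  int atoms_per_block = block_x / lanes_per_atom;       // lanes cooperate per atom
  return (n + atoms_per_block - 1) / atoms_per_block;   // ceil(n / atoms_per_block)
}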

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff currently supports at most 3 elements, i.e. 3*3*3 = 27 table entries,
// while the block size is never less than 32.
// SHARED_SIZE = 32 for now to reduce the shared-memory pressure per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (numtyp)0;
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
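The recurring replacement of zeta_idx() with a plain offset in this file (idx = nbor_j, minus n_stride when the short list is active) works because each atom's block in dev_short_nbor stores its neighbor count in the first slot, while zetaij has no count slot. A minimal Python sketch of that arithmetic, with made-up stride and slot values:

    # Minimal sketch of the index shift; n_stride and slot values are made up.
    n_stride = 4                         # stride between successive neighbor slots
    block_start = 20                     # slot holding atom i's neighbor count
    nbor_j = block_start + 3 * n_stride  # third neighbor: counts occupy slot 0
    idx = nbor_j - n_stride              # matching zetaij entry, which has no count slot
    assert idx == block_start + 2 * n_stride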

View File

@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
const double* h, const double* beta, const double* powern,
const double* powern_del, const double* ca1, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;
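Both compute() overloads (host and device neighboring) are dropped from this header and are presumably now inherited from BaseThree; the header instead gains _cutshortsq, the squared cutoff used to build the short neighbor list. As a loose Python analogy for that template-method shape (all names hypothetical):

    # Illustrative sketch only: shared driver in the base class,
    # style-specific kernel sequence in the subclass.
    class BaseThree:
        def compute(self, host_x, eflag, vflag, evatom):
            self.cast_and_upload(host_x)            # shared host/device bookkeeping
            return self.loop(eflag, vflag, evatom)  # style-specific kernels

    class TersoffMod(BaseThree):
        def cast_and_upload(self, host_x):
            pass                                    # stand-in for atom/neighbor handling
        def loop(self, eflag, vflag, evatom):
            return "k_short_nbor -> k_zeta -> three_center -> three_end"

    print(TersoffMod().compute(None, 1, 1, 0))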

View File

@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
"k_tersoff_zbl_short_nbor");
if (success!=0)
return success;
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffZT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
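The reallocation guard for _zetaij above resizes with roughly 10% headroom instead of growing to the exact requirement, which limits how often the buffer is reallocated as nall fluctuates. A condensed Python restatement (sizes hypothetical):

    # Condensed restatement of the reallocation guard; growth factor from the patch.
    def zetaij_capacity(current_cols, nall, max_nbors):
        if nall * max_nbors > current_cols:
            nmax = int(nall * 1.10)      # 10% headroom before the next resize
            return max_nbors * nmax
        return current_cols

    print(zetaij_capacity(1000, 300, 4))  # 1320: 300*4 > 1000, so 4 * int(330)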

View File

@ -109,7 +109,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
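The rename from store_zeta to acc_zeta better describes what the macro does: it accumulates the partial zeta sums of the t_per_atom lanes that share one atom, here with an XOR-butterfly shuffle. A rough Python model of that reduction (lane values hypothetical):

    # Rough model of the lane-shuffle reduction performed by acc_zeta.
    def acc_zeta(lanes):
        s = len(lanes) // 2              # t_per_atom is a power of two on the device
        while s > 0:
            lanes = [lanes[i] + lanes[i ^ s] for i in range(len(lanes))]  # shfl_xor
            s >>= 1
        return lanes[0]                  # every lane ends with the full sum

    print(acc_zeta([1.0, 2.0, 3.0, 4.0]))  # 10.0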
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
#endif
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
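The kernel above emits a packed per-atom layout: slot 0 of each atom's strided block in dev_short_nbor holds the count of surviving neighbors, and the neighbors themselves follow at n_stride intervals. A host-side Python mock of that filtering and layout (the device striding is flattened away; coordinates and cutoff are made up):

    # Host-side mock of the dev_short_nbor layout: count first, neighbors after.
    def build_short_nbor(positions, nbors, cutsq):
        short = {}
        for i, js in nbors.items():
            keep = [j for j in js
                    if sum((a - b) ** 2 for a, b in zip(positions[i], positions[j])) < cutsq]
            short[i] = [len(keep)] + keep   # slot 0 is the count
        return short

    pos = {0: (0.0, 0.0, 0.0), 1: (1.0, 0.0, 0.0), 2: (5.0, 0.0, 0.0)}
    print(build_short_nbor(pos, {0: [1, 2]}, cutsq=4.0))   # {0: [1, 1]}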
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
const double global_a_0, const double global_epsilon_0, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
int _max_nbors;
numtyp _global_e,_global_a_0,_global_epsilon_0;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end");
"k_vashishta_three_end","k_vashishta_short_nbor");
if (success!=0)
return success;
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
double r0sqmax = 0;
for (int i=0; i<nparams; i++) {
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
if (r0sqmax < r0sq) r0sqmax = r0sq;
dview[i].x=static_cast<numtyp>(r0sq);
dview[i].y=static_cast<numtyp>(gamma[i]);
dview[i].z=static_cast<numtyp>(cutsq[i]);
dview[i].w=static_cast<numtyp>(r0[i]);
}
_cutshortsq = static_cast<numtyp>(r0sqmax);
ucl_copy(param4,dview,false);
param4_tex.get_texture(*(this->pair_program),"param4_tex");
param4_tex.bind_float(param4,4);
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &param4, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// note that k_pair does not run with the short neighbor list
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans;
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
end_ans=this->ans;
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -136,6 +136,64 @@ texture<int4> param5_tex;
#endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param4,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
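Note that the filter here tests param4[ijparam].x, i.e. r0 squared, not the full pair cutoff: the short list only feeds the three-body kernels, while k_pair still traverses the full list (see the host-side comment in loop() above). The host mirrors this choice when it records the maximum r0 squared as _cutshortsq; in Python terms (r0 values made up):

    r0 = [3.0, 2.6, 4.4]                 # per-parameter r0 values (made up)
    cutshortsq = max(r * r for r in r0)  # what _cutshortsq records on the host
    print(cutshortsq)                    # 19.36: the three-body range, not full cutsq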
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param1,
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp param3_dvrc=param3_ijparam.z;
numtyp param3_c0 =param3_ijparam.w;
numtyp r=sqrt(rsq);
numtyp r=ucl_sqrt(rsq);
numtyp rinvsq=1.0/rsq;
numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv;
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp reta = pow(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
numtyp force = (param2_dvrc*r
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij = param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij = param4_ijparam.w;
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
numtyp _cutshortsq;
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;

View File

@ -6,6 +6,8 @@
from __future__ import print_function
import sys,os,re,subprocess
# help message
help = """
Syntax from src dir: make lib-kim args="-b -v version -a kim-name"
or: make lib-kim args="-b -a everything"
@ -23,7 +25,7 @@ specify one or more options, order does not matter
-b = download and build base KIM API library with example Models
this will delete any previous installation in the current folder
-n = do NOT download and build base KIM API library.
Use an existing installation
Use an existing installation
-p = specify location of KIM API installation (implies -n)
-a = add single KIM model or model driver with kim-name
to existing KIM API lib (see example below).
@ -78,13 +80,27 @@ def which(program):
return None
def geturl(url,fname):
success = False
if which('curl') != None:
cmd = 'curl -L -o "%s" %s' % (fname,url)
elif which('wget') != None:
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling curl failed with: %s" % e.output.decode('UTF-8'))
if not success and which('wget') != None:
cmd = 'wget -O "%s" %s' % (fname,url)
else: error("cannot find 'wget' or 'curl' to download source code")
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
return txt
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling wget failed with: %s" % e.output.decode('UTF-8'))
if not success:
error("Failed to download source code with 'curl' or 'wget'")
return
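The reworked geturl() tries curl first, falls back to wget only if curl is absent or its invocation fails, and errors out once both routes are exhausted. A compressed Python sketch of the same fallback, using shutil.which in place of the script's own which() helper (URL and filename are caller-supplied):

    # Compressed sketch of the download fallback above.
    import shutil, subprocess

    def geturl(url, fname):
        for tool, cmd in (('curl', ['curl', '-L', '-o', fname, url]),
                          ('wget', ['wget', '-O', fname, url])):
            if shutil.which(tool) is None:
                continue
            try:
                subprocess.check_output(cmd, stderr=subprocess.STDOUT)
                return True            # first tool that succeeds wins
            except subprocess.CalledProcessError as e:
                print("Calling %s failed with: %s" % (tool, e.output.decode('UTF-8')))
        raise RuntimeError("Failed to download source code with 'curl' or 'wget'")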
# parse args

View File

@ -1,5 +1,46 @@
# Change Log
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
**Implemented enhancements:**
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
**Fixed bugs:**
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)

View File

@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings.
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2>&1 | grep PGI | wc -l))
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifneq ($(OMPI_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
ifneq ($(MPICH_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
endif
endif
# Set compiler warnings flags.
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
# TODO check if PGI accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# TODO check if cray accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
#gcc
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
endif
endif
endif
endif
endif
else
KOKKOS_INTERNAL_COMPILER_WARNINGS =
endif
# Set OpenMP flags.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
@ -162,6 +193,7 @@ endif
# Intel based.
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -257,12 +290,10 @@ endif
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
# No warnings:
KOKKOS_CXXFLAGS =
# INTEL and CLANG warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
# GCC warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
endif
KOKKOS_LIBS = -lkokkos -ldl
KOKKOS_LDFLAGS = -L$(shell pwd)
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xSSE4.2
KOKKOS_LDFLAGS += -xSSE4.2
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS += -tp=nehalem
KOKKOS_LDFLAGS += -tp=nehalem
else
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -msse4.2
KOKKOS_LDFLAGS += -msse4.2
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
endif
endif
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else

View File

@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)

View File

@ -61,14 +61,19 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = omp_get_max_threads();
int threads_count = 0;
#pragma omp parallel
{
#pragma omp atomic
++threads_count;
}
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
if (threads_count > 3) {
threads_count /= 2;
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()
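
The fixture change above counts threads from inside an active parallel region, which reports the team size the runtime actually launches (omp_get_max_threads() only gives an upper bound, which can differ under dynamic thread adjustment). The same idiom as a standalone sketch, compiled with OpenMP enabled (e.g. g++ -fopenmp):

#include <cstdio>
#include <omp.h>
int main() {
  int threads_count = 0;
  // Every thread in the region increments the counter exactly once,
  // so the final value is the actual number of threads launched.
  #pragma omp parallel
  {
    #pragma omp atomic
    ++threads_count;
  }
  printf("max: %d, counted: %d\n", omp_get_max_threads(), threads_count);
  return 0;
}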

View File

@ -1,12 +1,12 @@
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -35,7 +35,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
@ -283,12 +283,12 @@ struct test_random_scalar {
RandomGenerator& pool,
unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
using Kokkos::parallel_reduce;
{
cerr << " -- Testing randomness properties" << endl;
cout << " -- Testing randomness properties" << endl;
RandomProperties result;
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
@ -307,7 +307,7 @@ struct test_random_scalar {
( 1.5*tolerance > variance_eps)) ? 1:0;
pass_covar = ((-2.0*tolerance < covariance_eps) &&
( 2.0*tolerance > covariance_eps)) ? 1:0;
cerr << "Pass: " << pass_mean
cout << "Pass: " << pass_mean
<< " " << pass_var
<< " " << mean_eps
<< " " << variance_eps
@ -315,7 +315,7 @@ struct test_random_scalar {
<< " || " << tolerance << endl;
}
{
cerr << " -- Testing 1-D histogram" << endl;
cout << " -- Testing 1-D histogram" << endl;
RandomProperties result;
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
@ -335,7 +335,7 @@ struct test_random_scalar {
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
( 0.06 > covariance_eps)) ? 1:0;
cerr << "Density 1D: " << mean_eps
cout << "Density 1D: " << mean_eps
<< " " << variance_eps
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
<< " || " << tolerance
@ -348,7 +348,7 @@ struct test_random_scalar {
<< endl;
}
{
cerr << " -- Testing 3-D histogram" << endl;
cout << " -- Testing 3-D histogram" << endl;
RandomProperties result;
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
@ -368,7 +368,7 @@ struct test_random_scalar {
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
cerr << "Density 3D: " << mean_eps
cout << "Density 3D: " << mean_eps
<< " " << variance_eps
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
<< " || " << tolerance
@ -381,18 +381,18 @@ struct test_random_scalar {
template <class RandomGenerator>
void test_random(unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
cerr << "Test Seed:" << ticks << endl;
cout << "Test Seed:" << ticks << endl;
RandomGenerator pool(ticks);
cerr << "Test Scalar=int" << endl;
cout << "Test Scalar=int" << endl;
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int.pass_mean,1);
ASSERT_EQ( test_int.pass_var,1);
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=unsigned int" << endl;
cout << "Test Scalar=unsigned int" << endl;
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint.pass_mean,1);
ASSERT_EQ( test_uint.pass_var,1);
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=int64_t" << endl;
cout << "Test Scalar=int64_t" << endl;
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int64.pass_mean,1);
ASSERT_EQ( test_int64.pass_var,1);
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=uint64_t" << endl;
cout << "Test Scalar=uint64_t" << endl;
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint64.pass_mean,1);
ASSERT_EQ( test_uint64.pass_var,1);
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=float" << endl;
cout << "Test Scalar=float" << endl;
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_float.pass_mean,1);
ASSERT_EQ( test_float.pass_var,1);
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=double" << endl;
cout << "Test Scalar=double" << endl;
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_double.pass_mean,1);
ASSERT_EQ( test_double.pass_var,1);
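
For context, a minimal sketch of the generator-pool pattern this test exercises, assuming the Kokkos_Random.hpp interface of the 2.x API (Random_XorShift64_Pool, get_state/free_state, urand64 — treat the exact names as assumptions rather than verified against this diff):

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cstdint>
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    typedef Kokkos::Random_XorShift64_Pool<> pool_type;
    pool_type pool(5374857);  // seed, e.g. a clock-tick count as in test_random()
    Kokkos::View<uint64_t*> draws("draws", 100);
    Kokkos::parallel_for(100, KOKKOS_LAMBDA(const int i) {
      // Lease a per-thread generator, draw, and return it to the pool.
      pool_type::generator_type gen = pool.get_state();
      draws(i) = gen.urand64();
      pool.free_state(gen);
    });
  }
  Kokkos::finalize();
  return 0;
}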

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/

View File

@ -44,12 +44,13 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<bench.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize();
if(argc<10) {
if(argc<10) {
printf("Arguments: N K R D U F T S\n");
printf(" P: Precision (1==float, 2==double)\n");
printf(" N,K: dimensions of the 2D array to allocate\n");
@ -68,7 +69,7 @@ int main(int argc, char* argv[]) {
Kokkos::finalize();
return 0;
}
int P = atoi(argv[1]);
int N = atoi(argv[2]);
@ -80,7 +81,7 @@ int main(int argc, char* argv[]) {
int T = atoi(argv[8]);
int S = atoi(argv[9]);
if(U>8) {printf("U must be 1-8\n"); return 0;}
if(U>8) {printf("U must be 1-8\n"); return 0;}
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}

View File

@ -44,11 +44,11 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<gather.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
if(argc<8) {
printf("Arguments: S N K D\n");
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3 -g
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
KOKKOS_CUDA_OPTIONS+=enable_lambda
else
CXX = g++
CXXFLAGS = -O3 -g -Wall -Werror
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include "policy_perf_test.hpp"
int main(int argc, char* argv[] ) {
Kokkos::initialize(argc,argv);
if(argc<11) {
printf(" Ten arguments are needed to run this program:\n");
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
printf(" team_range: number of teams (league_size)\n");
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
printf(" team_size: number of team members (team_size)\n");
printf(" vector_size: desired vectorization (if possible)\n");
printf(" schedule: 1 == Static 2 == Dynamic\n");
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
printf(" TeamPolicy:\n");
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" RangePolicy:\n");
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
printf(" Y: 0 = none\n");
printf(" Z: 0 = none\n");
printf(" Example Input:\n");
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
Kokkos::finalize();
return 0;
}
int team_range = atoi(argv[1]);
int thread_range = atoi(argv[2]);
int vector_range = atoi(argv[3]);
int outer_repeat = atoi(argv[4]);
int thread_repeat = atoi(argv[5]);
int vector_repeat = atoi(argv[6]);
int team_size = atoi(argv[7]);
int vector_size = atoi(argv[8]);
int schedule = atoi(argv[9]);
int test_type = atoi(argv[10]);
int disable_verbose_output = 0;
if ( argc > 11 ) {
disable_verbose_output = atoi(argv[11]);
}
if ( schedule != 1 && schedule != 2 ) {
printf("schedule: %d\n", schedule);
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
Kokkos::finalize();
return -1;
}
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
&& test_type != 300 && test_type != 400 && test_type != 500
)
{
printf("Incorrect test_type option\n");
Kokkos::finalize();
return -2;
}
double result = 0.0;
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
lval += 1;
}, result);
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
// Allocate view without initializing
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
// Second call to test is the one we actually care about and time
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
double result_computed = 0.0;
double result_expect = 0.0;
double time = 0.0;
if(schedule==1) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if(schedule==2) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if ( disable_verbose_output == 0 ) {
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
}
else {
printf("%lf\n",time);
}
Kokkos::finalize();
return 0;
}

View File

@ -0,0 +1,354 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template < class ViewType >
struct ParallelScanFunctor {
using value_type = double;
ViewType v;
ParallelScanFunctor( const ViewType & v_ )
: v(v_)
{}
KOKKOS_INLINE_FUNCTION
void operator()( const int idx, value_type& val, const bool& final ) const
{
// inclusive scan
val += v(idx);
if ( final ) {
v(idx) = val;
}
}
};
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
void test_policy(int team_range, int thread_range, int vector_range,
int outer_repeat, int thread_repeat, int inner_repeat,
int team_size, int vector_size, int test_type,
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
double &result, double &result_expect, double &time) {
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
typedef typename t_policy::member_type t_team;
Kokkos::Timer timer;
for(int orep = 0; orep<outer_repeat; orep++) {
if (test_type == 100) {
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
v1(idx) = idx;
// prevent compiler optimizing loop away
});
}
if (test_type == 110) {
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2( idx, t ) = t;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 111) {
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
});
}
});
}
if (test_type == 112) {
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
}
v2( idx, t ) = vector_result;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 120) {
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 121) {
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
}
}, team_result);
}
v3( idx, 0, 0 ) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 122) {
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
}
lval += vector_result;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 200) {
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
lval+=team.team_size()*team.league_rank() + team.team_rank();
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
// sum ( seq( [0, team_range*team_size) )
}
if (test_type == 210) {
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
});
}
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 211) {
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
});
}
lval+=idx+thread_for;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 212) {
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double vector_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
vval += vi;
}, vector_result );
}
});
}
lval+= idx + vector_result;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
}
if (test_type == 220) {
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
tval += t;
},team_result);
}
lval+=team_result*team.league_rank(); // constant * league_rank
},result);
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 221) {
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_for = 1.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
}
tval += t + vector_for;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 222) {
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
vval += vi;
}, vector_result);
}
tval += t + vector_result;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
// parallel_for RangePolicy: range = team_size*team_range
if (test_type == 300) {
Kokkos::parallel_for("300 outer for", team_size*team_range,
KOKKOS_LAMBDA (const int idx) {
v1(idx) = idx;
// prevent compiler from optimizing away the loop
});
}
// parallel_reduce RangePolicy: range = team_size*team_range
if (test_type == 400) {
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
KOKKOS_LAMBDA (const int idx, double& val) {
val += idx;
}, result);
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
// parallel_scan RangePolicy: range = team_size*team_range
if (test_type == 500) {
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
ParallelScanFunctor<ViewType1>(v1)
#if 0
// This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
// inclusive scan
val += v1(idx);
if ( final ) {
v1(idx) = val;
}
}
#endif
);
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
} // end outer for loop
time = timer.seconds();
} //end test_policy
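
A small usage sketch of the functor-based scan defined at the top of this file (the functor form sidesteps the pre-CUDA-8 lambda limitation flagged in the #if 0 block of main.cpp, issue #913): an inclusive scan over v = [1,1,1,1] leaves [1,2,3,4]. The include name below matches this diff; the rest is a hypothetical driver:

#include <Kokkos_Core.hpp>
#include <cstdio>
#include "policy_perf_test.hpp"  // for ParallelScanFunctor (defined above)
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type;
    view_type v("v", 4);
    Kokkos::deep_copy(v, 1.0);
    Kokkos::parallel_scan("inclusive scan", 4, ParallelScanFunctor<view_type>(v));
    // Copy back to host before printing (required on Cuda backends).
    view_type::HostMirror h = Kokkos::create_mirror_view(v);
    Kokkos::deep_copy(h, v);
    printf("%g %g %g %g\n", h(0), h(1), h(2), h(3));  // 1 2 3 4
  }
  Kokkos::finalize();
  return 0;
}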

View File

@ -0,0 +1,53 @@
#!/bin/bash
# Script to check that the policy_perf_test code works with each possible combo of options
echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"
EXECUTABLE=policy_performance
TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Host tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Host tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi
SUFFIX=cuda
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Cuda tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Cuda tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi

View File

@ -0,0 +1,126 @@
#!/bin/bash
# Sample script for benchmarking policy performance
# Suggested environment variables to export prior to executing script:
# KNL:
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
# Power:
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true
# Constants and Variables:
# Vary: TEAMSIZE and THREADRANGE
# for TEAMSIZE in {1,2,4,5,8}; do
# for THREADRANGE in {32,41,1000}; do
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
# System specific: Adjust REPEAT values to architecture tests are run on
# Tests
# Static SCHEDULE = 1
# Tier 1: parallel_for + RangePolicy 300
# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Dynamic SCHEDULE = 2
# Tier 5: parallel_for + RangePolicy 300
# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Results grouped by:
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE
EXECUTABLE=policy_performance
# Default defined values
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
# Host tests
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Host"
for SCHEDULE in {1,2}; do
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi # end host
# Cuda tests
SUFFIX=cuda
# TEAMRANGE=10000, TEAMSIZE=8 too large
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Cuda"
for SCHEDULE in {1,2}; do
# Reset defaults
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi #end cuda

lib/kokkos/bin/hpcbind (454 lines) Executable file
View File

@ -0,0 +1,454 @@
#!/usr/bin/env bash
################################################################################
# Check if hwloc commands exist
################################################################################
declare -i HPCBIND_HAS_HWLOC=1
type hwloc-bind >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-distrib >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ls >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-calc >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ps >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then
echo "hwloc not found, no process binding will occur"
fi
# Get parent cpuset
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
MY_PID="$BASHPID"
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
fi
################################################################################
# Check if nvidia-smi exist
################################################################################
declare -i HPCBIND_HAS_NVIDIA=0
type nvidia-smi >/dev/null 2>&1
HPCBIND_HAS_NVIDIA=$((!$?))
################################################################################
# Get visible gpu
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
NUM_GPUS=$(nvidia-smi -L | wc -l);
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
fi
declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
# Get queue id
# supports sbatch, bsub, aprun
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="sbatch"
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="bsub"
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="aprun"
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
################################################################################
# Show help
################################################################################
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Set the process mask, OMP environment variables and CUDA environment"
echo " variables to sane values if possible. Uses hwloc and nvidia-smi if"
echo " available. Will preserve the current process binding, so it is safe"
echo " to use with a queuing system or mpiexec."
echo ""
echo "Options:"
echo " --no-hwloc-bind Disable binding"
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP"
echo " threads Default: 100"
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --force-openmp-num-threads=N"
echo " Override logic for selecting OMP_NUM_THREADS"
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bing the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
}
################################################################################
# Parse command line arguments
################################################################################
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0
for i in $@; do
case $i in
# number of partitions to create
--no-hwloc-bind)
HPCBIND_ENABLE_HWLOC_BIND=0
shift
;;
--proc-bind=*)
HPCBIND_PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
HPCBIND_DISTRIBUTE="${i#*=}"
shift
;;
# which partition to use
--distribute-partition=*)
HPCBIND_PARTITION="${i#*=}"
shift
;;
--visible-gpus=*)
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
shift
;;
--gpu-ignore-queue)
HPCBIND_QUEUE_GPU_MAPPING=0
shift
;;
--no-gpu-mapping)
HPCBIND_ENABLE_GPU_MAPPING=0
shift
;;
--openmp=*)
HPCBIND_OPENMP_VERSION="${i#*=}"
shift
;;
--openmp-percent=*)
HPCBIND_OPENMP_PERCENT="${i#*=}"
shift
;;
--openmp-places=*)
HPCBIND_OPENMP_PLACES="${i#*=}"
shift
;;
--no-openmp-proc-bind)
HPCBIND_OPENMP_PROC_BIND=0
shift
;;
--force-openmp-proc-bind=*)
HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
shift
;;
--force-openmp-num-threads=*)
HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
shift
;;
--no-openmp-nested)
HPCBIND_OPENMP_NESTED="false"
shift
;;
--show-bindings)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=1
shift
;;
--lstopo)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=0
HPCBIND_LSTOPO=1
shift
;;
-v|--verbose)
HPCBIND_VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
################################################################################
# Check that visible gpus are valid
################################################################################
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
HPCBIND_VISIBLE_GPUS[$i]=0;
fi
done
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
echo "OpenMP percent < 1, setting to 1"
HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
echo "OpenMP percent > 100, setting to 100"
HPCBIND_OPENMP_PERCENT=100
fi
################################################################################
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
HPCBIND_DISTRIBUTE=1
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
echo "Invalid input for distribute-partition, changing to 0"
HPCBIND_PARTITION=0
fi
################################################################################
# Find cpuset and num threads
################################################################################
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
fi
declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT))
HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100))
if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then
HPCBIND_OPENMP_NUM_THREADS=1
elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
fi
if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
fi
################################################################################
# Set OpenMP environment variables
################################################################################
# set OMP_NUM_THREADS
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}
# set OMP_PROC_BIND and OMP_PLACES
if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then
#default proc bind logic
if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
#force proc bind
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
fi
else
# no openmp proc bind
unset OMP_PLACES
unset OMP_PROC_BIND
fi
# set OMP_NESTED
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################
# Set CUDA environment variables
################################################################################
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
fi
################################################################################
# Set hpcbind environment variables
################################################################################
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
fi
################################################################################
# Print verbose
################################################################################
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
MY_ENV=$(env | sort)
echo "[HPCBIND]"
echo "${MY_ENV}" | grep -E "^HPCBIND_"
echo "[CUDA]"
echo "${MY_ENV}" | grep -E "^CUDA_"
echo "[OPENMP]"
echo "${MY_ENV}" | grep -E "^OMP_"
fi
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "Unable to show bindings, hwloc not available."
fi
################################################################################
# Run command
################################################################################
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
else
eval $@
fi
else
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
else
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
fi
else
echo "Unable to show bindings, hwloc not available."
fi
fi
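The GPU mapping block above is a plain round-robin: each partition (or queued task) indexes the visible-GPU list with its id modulo NUM_GPUS. A minimal bash sketch of that arithmetic, with illustrative values (two GPUs, four partitions) that are not taken from the script:

HPCBIND_VISIBLE_GPUS=(0 1)              # illustrative: two visible GPUs
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
for HPCBIND_PARTITION in 0 1 2 3; do
  GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  echo "partition ${HPCBIND_PARTITION} -> CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
done
# prints devices 0,1,0,1: partitions are spread across the GPUs round-robin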

lib/kokkos/bin/kokkos-bind Executable file
View File

@ -0,0 +1,221 @@
#!/usr/bin/env bash
# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
#parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False
#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
MY_PID="$BASHPID"
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
echo "$CURRENT_CPUSET"
fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Uses hwloc to divide the node into the given number of groups,"
echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
echo " selected group."
echo ""
echo " NOTE: This command assumes it has exclusive use of the node"
echo ""
echo "Options:"
echo " --proc-bind=<LOC> Set the initial process mask for the script. "
echo "                        LOC can be any valid location argument for"
echo " hwloc-calc. Defaults to the entire machine"
echo " --distribute=N Distribute the current proc-bind into N groups"
echo " --index=I Use the i'th group (zero based)"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " (default 4.0)"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " -v|--verbose"
echo " -h|--help"
echo ""
echo "Sample Usage:"
echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
echo ""
}
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
for i in "$@"; do
case $i in
# number of partitions to create
--proc-bind=*)
PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
DISTRIBUTE="${i#*=}"
shift
;;
# which group to use
--index=*)
INDEX="${i#*=}"
shift
;;
--openmp=*)
OPENMP_VERSION="${i#*=}"
shift
;;
--no-openmp-proc-bind)
OPENMP_PROC_BIND=False
shift
;;
--no-openmp-nested)
OPENMP_NESTED=False
shift
;;
-v|--verbose)
VERBOSE=True
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
echo "Unknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
if [[ ${DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
DISTRIBUTE=1
fi
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
echo "Invalid input for index, changing index to 0"
INDEX=0
fi
if [[ ${HAS_HWLOC} -ne 0 ]]; then
echo "hwloc not found, no process binding will occur"
DISTRIBUTE=1
INDEX=0
fi
if [[ ${HAS_HWLOC} -eq 0 ]]; then
if [[ "${CURRENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
CPUSET=${CPUSETS[${INDEX}]}
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: true"
echo " proc_bind: ${PROC_BIND}"
echo " distribute: ${DISTRIBUTE}"
echo " index: ${INDEX}"
echo " parent_cpuset: ${CURRENT_CPUSET}"
echo " cpuset: ${CPUSET}"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
hwloc-bind ${CPUSET} -- $@
else
NUM_THREADS=$(grep -c processor /proc/cpuinfo)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: false"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
eval $@
fi
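Tying the script together, its own sample usage splits the node into four groups and runs on the third; the application name and PU count below are illustrative:

./kokkos-bind --distribute=4 --index=2 -v -- ./my_app
# On a hypothetical 16-PU node with hwloc installed, hwloc-distrib produces
# four cpusets of 4 PUs each; my_app is bound to the third cpuset and runs
# with OMP_NUM_THREADS=4, OMP_PLACES=threads, OMP_PROC_BIND=spread and
# OMP_NESTED=true.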

lib/kokkos/bin/runtest Executable file
View File

@ -0,0 +1,165 @@
#!/usr/bin/env bash
function get_path() {
cd "$(dirname "$0")"
cd ..
echo "$(pwd -P)"
}
KOKKOS_PATH="$(get_path "$0")"
function show_help() {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> "
echo " Build and run the tests"
echo ""
echo "Options:"
echo " -j=N|--make-j=N Build the tests in parallel"
echo " -c|--clean Clean build and regenerate make files"
echo " --clean-on-pass Clean build when runtest passes"
echo "  --output-prefix=<pre>   Prefix of log files. Default: runtest"
echo " --build-only Only build the tests"
echo " -v|--verbose Tee STDOUT and STDERR to screen and files"
echo " -h|--help Show this message"
echo ""
${KOKKOS_PATH}/generate_makefile.bash --help
return 0
}
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
for i in "$@"; do
case $i in
-j=*|--make-j=*)
MAKE_J=${i#*=}
shift
;;
-c|--clean)
CLEAN=1
shift
;;
--clean-on-pass)
CLEAN_ON_PASS=1
shift
;;
--output-prefix=*)
OUTPUT=${i#*=}
shift
;;
--build-only)
BUILD_ONLY=1
shift
;;
-v|--verbose)
VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
*)
GENERATE_ARGS+=("$i")
shift
;;
esac
done
if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
exit 1
fi
# Some makefile dependencies are incorrect, so clean needs to force
# a new call to generate_makefile.bash
if [[ ${CLEAN} -eq 1 ]]; then
START=${SECONDS}
echo "Cleaning"
/bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
END=${SECONDS}
echo " $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
fi
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
if [[ ${VERBOSE} -eq 0 ]]; then
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
else
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
fi
declare -i RESULT=$?
declare -i END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
grep "FAIL" ${OUTPUT}.out
grep "FAIL" ${OUTPUT}.err
echo " FAIL: $((END-START)) seconds"
exit 1
fi
START=${SECONDS}
echo "Building"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.out
grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.err
echo " FAIL: $((END-START)) seconds"
exit 1
fi
if [[ ${BUILD_ONLY} -eq 0 ]]; then
START=${SECONDS}
echo "Testing"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
make clean
fi
else
grep "FAIL" ${OUTPUT}.out
grep "FAIL" ${OUTPUT}.err
echo " FAIL: $((END-START)) seconds"
exit 1
fi
fi
exit ${RESULT}
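An illustrative invocation (paths assumed, not part of the script): runtest refuses to run from the repository root, forwards unrecognized options to generate_makefile.bash, and logs each phase to <prefix>.out and <prefix>.err:

mkdir -p /tmp/kokkos-build && cd /tmp/kokkos-build
${KOKKOS_PATH}/bin/runtest --make-j=8 --clean --output-prefix=nightly --with-devices=OpenMP
# each of the generate/build/test phases prints "PASS: <n> seconds", or dumps
# the matching FAIL/error lines from nightly.out and nightly.err and exits 1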

View File

@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
${Kokkos_SOURCE_DIR}/containers/src
${Kokkos_SOURCE_DIR}/algorithms/src
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
${KOKKOS_INCLUDE_DIRS}
)
# pass include dirs back to parent scope
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
IF(KOKKOS_SEPARATE_LIBS)

View File

@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a

View File

@ -0,0 +1,24 @@
#include <cstdio>
#include <cuda_runtime_api.h>
int main()
{
cudaDeviceProp prop;
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
if (cudaSuccess != err_code) {
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
return -1;
}
switch (prop.major) {
case 3:
printf("Kepler"); break;
case 5:
printf("Maxwell"); break;
case 6:
printf("Pascal"); break;
default:
fprintf(stderr, "Unsupported Device %d%d\n", (int)prop.major, (int)prop.minor);
return -1;
}
printf("%d%d\n", (int)prop.major, (int)prop.minor);
return 0;
}
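This helper is small enough to build and run standalone to identify the device generation; the file name below is assumed:

nvcc query_device.cu -o query_device
./query_device    # prints, e.g., "Pascal61" on a compute-capability 6.1 card
                  # (name and digits run together: the first printf has no newline)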

View File

@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@ -584,7 +589,7 @@ single_build_and_test() {
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
export JENKINS_DO_SERIAL=OFF
export JENKINS_DO_COMPLEX=OFF
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
export JENKINS_DO_SERIAL=ON
export JENKINS_DO_COMPLEX=ON
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
test-openmp: KokkosContainers_PerformanceTest_OpenMP
./KokkosContainers_PerformanceTest_OpenMP
build_all: $(TARGETS)
test: $(TEST_TARGETS)

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,15 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

View File

@ -69,30 +69,13 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned num_threads = 4;
if (Kokkos::hwloc::available()) {
num_threads = Kokkos::hwloc::get_available_numa_count()
* Kokkos::hwloc::get_available_cores_per_numa()
* Kokkos::hwloc::get_available_threads_per_core()
;
}
std::cout << "OpenMP: " << num_threads << std::endl;
Kokkos::OpenMP::initialize( num_threads );
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
Kokkos::OpenMP::initialize();
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()
{
Kokkos::OpenMP::finalize();
omp_set_num_threads(1);
ASSERT_EQ( 1 , omp_get_max_threads() );
}
};

View File

@ -564,7 +564,7 @@ namespace Impl {
template< class D, class A1, class A2, class A3, class ... Args >
struct DualViewSubview {
typedef typename Kokkos::Experimental::Impl::ViewMapping
typedef typename Kokkos::Impl::ViewMapping
< void
, Kokkos::ViewTraits< D, A1, A2, A3 >
, Args ...

View File

@ -46,19 +46,6 @@
///
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
/// related nonmember functions.
/*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
*/
#ifndef KOKKOS_DYNRANKVIEW_HPP
#define KOKKOS_DYNRANKVIEW_HPP
@ -117,6 +104,14 @@ struct DynRankDimTraits {
, layout.dimension[7] );
}
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
{
return computeRank(layout);
}
// Create the layout for the rank-7 view.
// Non-strided Layout
template <typename Layout>
@ -158,8 +153,17 @@ struct DynRankDimTraits {
);
}
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}
// Create a view from the given dimension arguments.
// This is only necessary because the shmem constructor doesn't take a layout.
// NDE shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
template <typename ViewType, typename ViewArg>
static ViewType createView( const ViewArg& arg
, const size_t N0
@ -186,7 +190,8 @@ struct DynRankDimTraits {
// Non-strided Layout
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
@ -202,7 +207,8 @@ struct DynRankDimTraits {
// LayoutStride
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 0 ? layout.stride[0] : (0)
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
/** \brief Assign compatible default mappings */
struct ViewToDynRankViewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class DstTraits , class SrcTraits >
class ViewMapping< DstTraits , SrcTraits ,
typename std::enable_if<(
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
)
)
)
) , ViewToDynRankViewTag >::type >
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
{
private:
@ -376,7 +387,7 @@ public:
typedef typename DstType::offset_type dst_offset_type ;
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_track.assign( src.m_track , DstTraits::is_managed );
dst.m_rank = src.Rank ;
}
@ -384,22 +395,20 @@ public:
} //end Impl
namespace Experimental {
/* \class DynRankView
* \brief Container that creates a Kokkos view with rank determined at runtime.
* Essentially this is a rank 7 view that wraps the access operators
* to yield the functionality of a view
* Essentially this is a rank 7 view
*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
* 4. Every subview is returned with LayoutStride
* 5. Copy and Copy-Assign View to DynRankView
* 6. deep_copy between Views and DynRankViews
* 7. rank( view ); returns the rank of View or DynRankView
*
*/
@ -427,7 +436,7 @@ public:
private:
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
track_type m_track ;
@ -556,7 +565,7 @@ public:
// Allow specializations to query their specialized map
KOKKOS_INLINE_FUNCTION
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , void > &
implementation_map() const { return m_map ; }
//----------------------------------------
@ -803,7 +812,7 @@ public:
, m_rank(rhs.m_rank)
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
}
@ -813,7 +822,7 @@ public:
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
m_track.assign( rhs.m_track , traits::is_managed );
@ -831,7 +840,7 @@ public:
, m_rank( rhs.Rank )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( *this , rhs );
}
@ -841,7 +850,7 @@ public:
DynRankView & operator = ( const View<RT,RP...> & rhs )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
Mapping::assign( *this , rhs );
return *this ;
@ -870,7 +879,7 @@ public:
)
: m_track()
, m_map()
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
// Append layout and spaces if not input
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
@ -923,7 +932,7 @@ public:
//------------------------------------------------------------
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -947,8 +956,8 @@ public:
>::type const & arg_layout
)
: m_track() // No memory tracking
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
static_assert(
std::is_same< pointer_type
@ -1034,6 +1043,7 @@ public:
{}
// For backward compatibility
// NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
explicit inline
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
, const typename traits::array_layout & arg_layout
@ -1179,6 +1189,11 @@ namespace Impl {
struct DynRankSubviewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class SrcTraits , class ... Args >
struct ViewMapping
< typename std::enable_if<(
@ -1192,7 +1207,7 @@ struct ViewMapping
std::is_same< typename SrcTraits::array_layout
, Kokkos::LayoutStride >::value
)
), DynRankSubviewTag >::type
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
, SrcTraits
, Args ... >
{
@ -1264,7 +1279,7 @@ public:
};
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
template < typename T , class ... P >
KOKKOS_INLINE_FUNCTION
@ -1336,9 +1351,10 @@ public:
} // end Impl
namespace Experimental {
template< class V , class ... Args >
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
template< class D , class ... P , class ...Args >
KOKKOS_INLINE_FUNCTION
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
return metafcn::subview( src.rank() , src , args... );
}
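A minimal sketch of the behavior documented in the comment block above (rank fixed at runtime, LayoutStride subviews, deep_copy to and from View); the namespaces follow the test code elsewhere in this patch, everything else is illustrative:

#include <cstdio>
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DynRankView<double, Kokkos::HostSpace> a("a", 4, 3);  // rank determined at runtime: 2
    Kokkos::View<double**, Kokkos::HostSpace> v("v", 4, 3);
    Kokkos::deep_copy(a, v);                 // deep_copy between View and DynRankView (change 6 above)
    auto row = Kokkos::Experimental::subdynrankview(a, 1, Kokkos::ALL());  // rank-1, LayoutStride
    printf("rank(a)=%u rank(row)=%u\n", unsigned(a.rank()), unsigned(row.rank()));
  }
  Kokkos::finalize();
  return 0;
}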

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -57,7 +57,7 @@ namespace Experimental {
*/
template< typename DataType , typename ... P >
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
{
{
public:
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
@ -68,7 +68,7 @@ private:
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
, "DynamicView must be rank-one" );
static_assert( std::is_trivial< typename traits::value_type >::value &&
@ -216,14 +216,14 @@ public:
// Verify that allocation of the requested chunk is in progress.
// The allocated chunk counter is m_chunks[ m_chunk_max ]
const uintptr_t n =
const uintptr_t n =
*reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
if ( n <= ic ) {
Kokkos::abort("Kokkos::DynamicView array bounds error");
}
// Allocation of this chunk is in progress
// Allocation of this chunk is in progress
// so wait for allocation to complete.
while ( 0 == *ch );
}
@ -267,7 +267,7 @@ public:
const uintptr_t jc_try = jc ;
// Jump iteration to the chunk counter.
jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
if ( jc_try == jc ) {
@ -316,7 +316,7 @@ public:
}
else {
while ( NC + 1 <= *pc ) {
--*pc ;
--*pc ;
m_pool.deallocate( m_chunks[*pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
@ -331,7 +331,7 @@ public:
typename traits::value_type ** m_chunks ;
uintptr_t * m_pc ;
uintptr_t m_nc ;
unsigned m_chunk_shift ;
unsigned m_chunk_shift ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const
@ -348,7 +348,7 @@ public:
}
else {
while ( m_nc + 1 <= *m_pc ) {
--*m_pc ;
--*m_pc ;
m_pool.deallocate( m_chunks[*m_pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*m_pc] = 0 ;
@ -482,7 +482,7 @@ public:
};
/**\brief Allocation constructor
/**\brief Allocation constructor
*
* Memory is allocated in chunks from the memory pool.
* The chunk size conforms to the memory pool's chunk size.
@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");

View File

@ -69,6 +69,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
//----------------------------------------------------------------------------
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Cuda >();
}
TEST_F( cuda, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
}
TEST_F( cuda , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();

View File

@ -66,6 +66,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
#include <iomanip>
namespace Test {
@ -76,14 +78,7 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = 4 ;
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::initialize();
}
static void TearDownTestCase()
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
TestDynViewAPI< double , Kokkos::OpenMP >();
}
TEST_F( openmp, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
}
TEST_F( openmp, bitset )
{
test_bitset<Kokkos::OpenMP>();

View File

@ -67,6 +67,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class serial : public ::testing::Test {
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
TestDynViewAPI< double , Kokkos::Serial >();
}
TEST_F( serial, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
}
TEST_F( serial , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();

View File

@ -70,6 +70,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class threads : public ::testing::Test {
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Threads >();
}
TEST_F( threads, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
}
TEST_F( threads , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();

View File

@ -0,0 +1,213 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <type_traits>
#include <typeinfo>
namespace Test {
namespace {
template <typename ExecSpace >
struct TestViewCtorProp_EmbeddedDim {
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
template < class ViewType >
struct Functor {
ViewType v;
Functor( const ViewType & v_ ) : v(v_) {}
KOKKOS_INLINE_FUNCTION
void operator()( const int i ) const {
v(i) = i;
}
};
static void test_vcpt( const int N0, const int N1 )
{
// Create two views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
#if 0
// debug output
for ( int i = 0; i < N0*N1; ++i ) {
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
}
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
if ( std::is_same< CommonViewValueType, double >::value == true ) {
printf("Proper common value_type\n");
}
else {
printf("WRONG common value_type\n");
}
// end debug output
#endif
}
{
// Single view
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
// Create two dynamic rank views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
}
{
// Single views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
} // end test_vcpt
}; // end struct
} // namespace
} // namespace Test
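Condensed from the test above: the new Kokkos::common_view_alloc_prop deduces a common value_type from its view arguments and carries it into view_alloc. A minimal host-side sketch:

#include <type_traits>
#include <Kokkos_Core.hpp>
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int**, Kokkos::DefaultExecutionSpace> vi("vi", 2, 3);
    Kokkos::View<double*, Kokkos::DefaultExecutionSpace> vd("vd", 2);
    auto prop = Kokkos::common_view_alloc_prop(vi, vd);   // combines int and double
    using CommonT = typename decltype(prop)::value_type;
    static_assert(std::is_same<CommonT, double>::value, "common value_type is double");
    Kokkos::View<CommonT*, Kokkos::DefaultExecutionSpace>
        cv(Kokkos::view_alloc("cv", prop), 6);            // allocated with the deduced type
  }
  Kokkos::finalize();
  return 0;
}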

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

View File

@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
test-taskdag: KokkosCore_PerformanceTest_TaskDAG
./KokkosCore_PerformanceTest_TaskDAG
build_all: $(TARGETS)
test: $(TEST_TARGETS)

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Core.hpp>
namespace Test {

File diff suppressed because it is too large

View File

@ -53,6 +53,7 @@
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif
namespace Kokkos {
namespace Impl {
struct CudaLockArraysStruct {
int* atomic;
int* scratch;
int* threadid;
int n;
};
}
}
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
namespace Kokkos {
namespace Impl {
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}
}
}
template< typename T >
inline
__device__
@ -192,7 +152,7 @@ namespace Impl {
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
driver();
}
template< class DriverType >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
inline
CudaParallelLaunch( const DriverType & driver
@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
inline
CudaParallelLaunch( const DriverType & driver
@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
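With this change a LaunchBounds trait can be threaded through to the kernel launch. A sketch, assuming the Kokkos::LaunchBounds<maxTperB, minBperSM> trait is accepted by the execution policy (as the new template parameters suggest; requires CUDA lambda support):

#include <Kokkos_Core.hpp>
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int N = 1 << 20;
    Kokkos::View<double*, Kokkos::CudaSpace> x("x", N);
    // Kernel compiled with __launch_bounds__(256, 2): at most 256 threads
    // per block, at least 2 resident blocks per SM.
    using Policy = Kokkos::RangePolicy<Kokkos::Cuda, Kokkos::LaunchBounds<256, 2> >;
    Kokkos::parallel_for("fill", Policy(0, N),
      KOKKOS_LAMBDA(const int i) { x(i) = 2.0 * i; });
  }
  Kokkos::finalize();
  return 0;
}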

View File

@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
} catch(...) {}
}
constexpr const char* CudaSpace::name() {
return m_name;
}
constexpr const char* CudaUVMSpace::name() {
return m_name;
}
constexpr const char* CudaHostPinnedSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
#if 0
using Header = SharedAllocationHeader ;
// Copy the header from the allocation
Header head ;
@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
return ptr;
}
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
static void* ptr = NULL;
static std::int64_t current_size = 0;
@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
return ptr;
}
}
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA

View File

@ -51,6 +51,7 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -69,9 +70,6 @@
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
/*--------------------------------------------------------------------------*/
@ -103,6 +101,7 @@ int cuda_kernel_arch()
return arch ;
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
@ -111,16 +110,13 @@ bool cuda_launch_blocking()
return atoi(env);
}
#endif
}
void cuda_device_synchronize()
{
// static const bool launch_blocking = cuda_launch_blocking();
// if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// }
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
@ -240,6 +236,7 @@ public:
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
@ -248,6 +245,7 @@ public:
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t * m_stream ;
static int was_initialized;
@ -274,6 +272,7 @@ public:
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
@ -282,6 +281,7 @@ public:
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
if ( m_stream ||
m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
m_scratchUnified ||
m_scratchConcurrentBitset ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_maxConcurrency = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
m_maxConcurrency =
max_threads_per_sm * cudaProp.multiProcessorCount ;
const int32_t buffer_bound =
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
// Allocate and initialize uint32_t[ buffer_bound ]
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchBitset"
, sizeof(uint32_t) * buffer_bound );
Record::increment( r );
m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
Impl::initialize_host_cuda_lock_arrays();
}
//----------------------------------------------------------------------------
@ -635,9 +656,7 @@ void CudaInternal::finalize()
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(true);
scratch_lock_array_cuda_space_ptr(true);
threadid_lock_array_cuda_space_ptr(true);
Impl::finalize_host_cuda_lock_arrays();
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@ -653,6 +672,7 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
@ -666,6 +686,7 @@ void CudaInternal::finalize()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
}
@ -713,9 +734,8 @@ namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::concurrency() {
return 131072;
}
int Cuda::concurrency()
{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
@ -798,7 +818,22 @@ void Cuda::fence()
const char* Cuda::name() { return "Cuda"; }
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
UniqueToken( Kokkos::Cuda const & )
: m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
, m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
{}
} // namespace Experimental
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Kokkos_Cuda.hpp>
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
namespace Kokkos {
namespace Impl {
__device__ __constant__
CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
}
}
#endif
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1) {
Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
}
}
__global__ void init_lock_array_kernel_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<(unsigned)N) {
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
}
}
} // namespace
namespace Impl {
CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
void initialize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
sizeof(int)*(Cuda::concurrency())));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
void finalize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic == nullptr) return;
cudaFree(g_host_cuda_lock_arrays.atomic);
g_host_cuda_lock_arrays.atomic = nullptr;
cudaFree(g_host_cuda_lock_arrays.scratch);
g_host_cuda_lock_arrays.scratch = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
#endif

View File

@ -0,0 +1,166 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_LOCKS_HPP
#define KOKKOS_CUDA_LOCKS_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <cstdint>
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos {
namespace Impl {
struct CudaLockArrays {
std::int32_t* atomic;
std::int32_t* scratch;
std::int32_t n;
};
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
///
/// This call is idempotent.
void initialize_host_cuda_lock_arrays();
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// all null pointers, and all array memory has been freed.
///
/// This call is idempotent.
void finalize_host_cuda_lock_arrays();
} // namespace Impl
} // namespace Kokkos
#if defined( __CUDACC__ )
namespace Kokkos {
namespace Impl {
/// \brief This global variable in CUDA space is what kernels use
/// to get access to the lock arrays.
///
/// When relocatable device code is enabled, there can be one single
/// instance of this global variable for the entire executable,
/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
/// here must then be extern).
/// This one instance will be initialized by initialize_host_cuda_lock_arrays
/// and need not be modified afterwards.
///
/// When relocatable device code is disabled, an instance of this variable
/// will be created in every translation unit that sees this header file
/// (we make this clear by marking it static, meaning no other translation
/// unit can link to it).
/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
}
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address_cuda_space.
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
}
} // namespace Impl
} // namespace Kokkos
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
Kokkos::Impl::g_device_cuda_lock_arrays , \
& Kokkos::Impl::g_host_cuda_lock_arrays , \
sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
}
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
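A reviewer's sketch of how these pieces fit together (the Complex struct and the kernel call are hypothetical; the lock/unlock functions and the macro are the ones defined above, and the header must be included under KOKKOS_ENABLE_CUDA). The real Kokkos atomics also issue memory fences around the critical section; this shows only the control flow, using the done-loop shape that keeps the acquire attempt and the critical section in the same branch so pre-Volta lockstep warps cannot deadlock:
// assumes: #include <Cuda/Kokkos_Cuda_Locks.hpp>, compiled with nvcc
struct Complex { double re, im; };  // hypothetical 16-byte payload with no native CAS
__device__ void atomic_add_complex( Complex * dst , Complex val )
{
  bool done = false ;
  while ( ! done ) {
    if ( Kokkos::Impl::lock_address_cuda_space( (void*) dst ) ) {
      dst->re += val.re ;  // critical section: plain read-modify-write
      dst->im += val.im ;
      Kokkos::Impl::unlock_address_cuda_space( (void*) dst );
      done = true ;
    }
  }
}
// Host side, once before launching kernels that use the locks
// (a no-op under relocatable device code, a symbol copy otherwise):
//   KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
//   my_kernel<<< grid , block >>>( /* ... */ );  // my_kernel is hypothetical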

View File

@ -58,6 +58,7 @@
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -65,6 +66,8 @@
#include <typeinfo>
#endif
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -318,6 +321,7 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -363,7 +367,7 @@ public:
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
ParallelFor( const FunctorType & arg_functor ,
@ -373,6 +377,115 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::launch_bounds LaunchBounds;
const FunctorType m_functor ;
const Policy m_rp ;
public:
inline
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
inline
void execute() const
{
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
if ( RP::rank == 2 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, 1
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 3 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 4 )
{
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 5 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 6 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
, static_cast<index_type>(maxblocks) )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else
{
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
Kokkos::abort("Aborting");
}
} //end execute
// inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_rp( arg_policy )
{}
};
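For orientation, a caller-side sketch of what this specialization services (bounds, tiles, and the View calls are illustrative, and the exact MDRangePolicy constructor syntax in this Kokkos version may differ slightly): a rank-2 policy maps the tile dimensions to (blockDim.x, blockDim.y), with the grid clamped at UpperBoundGridCount per dimension exactly as in execute() above.
#include <Kokkos_Core.hpp>
void scale_2d( Kokkos::View< double** , Kokkos::Cuda > a )
{
  typedef Kokkos::Experimental::MDRangePolicy
    < Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  // { lower0 , lower1 } , { upper0 , upper1 } , { tile0 , tile1 }
  policy_type policy( {{ 0 , 0 }}
                    , {{ (long) a.dimension_0() , (long) a.dimension_1() }}
                    , {{ 16 , 16 }} );
  Kokkos::parallel_for( policy , KOKKOS_LAMBDA ( const int i , const int j ) {
    a(i,j) *= 2.0 ;  // one (i,j) per thread within each 16x16 tile
  });
}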
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Properties ... >
@ -384,6 +497,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
public:
@ -430,15 +544,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -448,7 +562,8 @@ public:
}
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
@ -462,7 +577,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -473,7 +588,7 @@ public:
const dim3 grid( int(m_league_size) , 1 , 1 );
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
}
@ -529,6 +644,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -563,6 +679,7 @@ private:
typedef int DummySHMEMReductionType;
public:
// Make the exec_range calls dispatch to Reduce::DeviceIterateTile
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
@ -686,7 +803,7 @@ public:
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
@ -737,6 +854,232 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
public:
inline
__device__
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
this-> exec_range( value );
}
// Reduce with final value at blockDim.y - 1 location.
// Problem: non power-of-two blockDim
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const Member work_part =
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
this-> exec_range( value );
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
inline
void execute()
{
const int nwork = m_policy.m_num_tiles;
if ( nwork ) {
int block_size = m_policy.m_prod_tile_dims;
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
// Nearest power of two
int exponent_pow_two = std::ceil( std::log2(block_size) );
block_size = std::pow(2, exponent_pow_two);
int suggested_blocksize = local_block_size( m_functor );
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
};
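And the matching reduction call for the specialization just above, continuing the earlier parallel_for sketch with the same caveats (illustrative names; a plain scalar result relies on parallel_reduce's host-view wrapping):
void sum_2d( Kokkos::View< const double** , Kokkos::Cuda > a , double & result )
{
  typedef Kokkos::Experimental::MDRangePolicy
    < Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  policy_type policy( {{ 0 , 0 }}
                    , {{ (long) a.dimension_0() , (long) a.dimension_1() }}
                    , {{ 16 , 16 }} );
  Kokkos::parallel_reduce( policy ,
    KOKKOS_LAMBDA ( const int i , const int j , double & update ) {
      update += a(i,j) ;  // per-thread partial sums, joined across tiles/blocks
    } , result );
}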
//----------------------------------------------------------------------------
#if 1
@ -753,6 +1096,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -819,15 +1163,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -840,7 +1184,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -854,7 +1198,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -894,7 +1239,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -936,7 +1282,7 @@ public:
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
@ -975,12 +1321,6 @@ public:
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
@ -991,6 +1331,12 @@ public:
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
@ -1150,6 +1496,7 @@ private:
typedef typename reducer_type<>::pointer_type pointer_type ;
typedef typename reducer_type<>::reference_type reference_type ;
typedef typename reducer_type<>::value_type value_type ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::REDUCE
@ -1273,7 +1620,7 @@ public:
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
// copy to device and execute
CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem );
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
Cuda::fence();
@ -1373,7 +1720,7 @@ public:
if ( CudaTraits::WarpSize < team_threads ) {
// Need inter-warp team reduction (collectives) shared memory
// Speculate an upper bound for the value size
// Speculate an upper bound for the value size
m_shmem_team_begin =
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
@ -1426,7 +1773,7 @@ public:
// Reduce space has claim flag followed by value buffer
const int global_reduce_value_size =
max_concurrent_block *
max_concurrent_block *
( aligned_flag_size + align_scratch( value_size ) );
// Scratch space has claim flag followed by scratch buffer
@ -1469,6 +1816,7 @@ private:
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
@ -1655,10 +2003,10 @@ public:
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
}
}

View File

@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
unsigned int shift = 1;
@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
@ -302,7 +306,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
if( id + 1 < int(gridDim.x) )
reducer.join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
}
}
@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
//typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
{
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
/* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );

View File

@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
#if defined( KOKKOS_DEBUG )
__device__
void verify_warp_convergence( const char * const where )
{
const unsigned b = __ballot(1);
if ( b != ~0u ) {
printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
, where
, blockIdx.x
, blockIdx.y
, blockIdx.z
, threadIdx.x
, threadIdx.y
, threadIdx.z
, b );
}
}
#endif // #if defined( KOKKOS_DEBUG )
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
, int32_t shmem_per_warp )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
extern __shared__ int32_t shmem_all[];
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
Member single_exec( warp_shmem , 1 );
Member team_exec( warp_shmem , blockDim.y );
task_root_type * task_ptr ;
// Loop until all queues are empty and no tasks in flight
@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
, uintptr_t(task_ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("task_ptr");
#endif
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
// copy global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
Kokkos::memory_fence();
// Copy done - use memory fence so that memory writes are visible.
// For reliable warp convergence on Pascal and Volta an explicit
// warp level synchronization will also be required.
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// copy shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
Kokkos::memory_fence();
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("apply");
#endif
// If respawn requested copy respawn data back to main memory
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
queue->complete( task_ptr );
}
}
} while(1);
@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
, int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );

View File

@ -57,7 +57,7 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
@ -78,7 +78,7 @@ public:
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static void driver( queue_type * const , int32_t );
static
void execute( queue_type * const );
@ -106,7 +106,14 @@ public:
extern template class TaskQueue< Kokkos::Cuda > ;
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
@ -134,11 +141,13 @@ private:
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
int32_t * m_team_shmem ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
: m_team_shmem( arg_team_shmem )
, m_team_size( arg_team_size ) {}
public:
@ -154,7 +163,13 @@ public:
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

View File

@ -106,7 +106,7 @@ private:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
void * m_team_reduce ;
mutable void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
int m_league_rank ;
@ -166,7 +166,7 @@ public:
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
@ -210,7 +210,7 @@ public:
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
@ -354,7 +354,7 @@ public:
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
@ -410,7 +410,7 @@ public:
value_type tmp( reducer.reference() );
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
@ -479,7 +479,7 @@ public:
__threadfence(); // Wait until global write is visible.
last_block = gridDim.x ==
last_block = (int)gridDim.x ==
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
// If last block then reset count
@ -509,7 +509,7 @@ public:
reducer.copy( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space) + offset );
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
reducer.join( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space)
+ i * reducer.length() );
@ -576,6 +576,14 @@ public:
, m_league_size( arg_league_size )
{}
public:
// Declare to avoid unused private member warnings which are triggered
// when SFINAE excludes the member function which uses these variables.
// Making another class a friend also suppresses these warnings.
bool impl_avoid_sfinae_warning() const noexcept
{
return m_team_reduce_size > 0 && m_team_reduce != nullptr;
}
};
} // namespace Impl
@ -913,10 +921,10 @@ void parallel_scan
// [t] += [t-4] if t >= 4
// ...
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
value_type tmp = 0 ;
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
if ( j <= threadIdx.x ) { sval += tmp ; }
if ( j <= (int)threadIdx.x ) { sval += tmp ; }
}
// Include accumulation and remove value for exclusive scan:

View File

@ -0,0 +1,133 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_ConcurrentBitset.hpp>
namespace Kokkos { namespace Experimental {
// both global and instance Unique Tokens are implemented in the same way
template<>
class UniqueToken< Cuda, UniqueTokenScope::Global >
{
private:
uint32_t volatile * m_buffer ;
uint32_t m_count ;
public:
using execution_space = Cuda;
explicit
UniqueToken( execution_space const& );
KOKKOS_INLINE_FUNCTION
UniqueToken() : m_buffer(0), m_count(0) {}
KOKKOS_INLINE_FUNCTION
UniqueToken( const UniqueToken & ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken( UniqueToken && ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( const UniqueToken & ) = default ;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( UniqueToken && ) = default ;
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t size() const noexcept { return m_count ; }
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t acquire() const
{
const Kokkos::pair<int,int> result =
Kokkos::Impl::concurrent_bitset::
acquire_bounded( m_buffer
, m_count
, Kokkos::Impl::clock_tic() % m_count
);
if ( result.first < 0 ) {
Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" );
}
return result.first;
}
/// \brief release an acquired value
KOKKOS_INLINE_FUNCTION
void release( int32_t i ) const noexcept
{
Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
}
};
template<>
class UniqueToken< Cuda, UniqueTokenScope::Instance >
: public UniqueToken< Cuda, UniqueTokenScope::Global >
{
public:
explicit
UniqueToken( execution_space const& arg )
: UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
};
}} // namespace Kokkos::Experimental
#endif // KOKKOS_ENABLE_CUDA
#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
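A usage sketch to close the loop (the view name and sizing are illustrative): UniqueToken hands out an index in [0, size()) that is unique among concurrently executing threads, so a pool sized by token.size() rather than by the problem size N can give each in-flight thread a private slot.
#include <Kokkos_Core.hpp>
void count_hits( Kokkos::View< double* , Kokkos::Cuda > pool , const int N )
{
  // pool must hold at least token.size() entries.
  Kokkos::Experimental::UniqueToken
    < Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >
      token( Kokkos::Cuda() );
  Kokkos::parallel_for( N , KOKKOS_LAMBDA ( const int /* i */ ) {
    const int32_t id = token.acquire();  // 0 <= id < token.size()
    pool(id) += 1.0 ;                    // slot is exclusively owned while held
    token.release(id);                   // release before the iteration returns
  });
}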

Some files were not shown because too many files have changed in this diff.