Compare commits

...

59 Commits

Author SHA1 Message Date
cf79751f4f Merge pull request #1207 from akohlmey/next-patch-release
Patch release 15 November 2018
2018-11-15 19:33:52 -05:00
e4dee3de17 Merge pull request #1206 from akohlmey/collected-small-changes
Collected small changes for next release
2018-11-15 17:29:26 -05:00
6e225d90fc fix some minor bugs with data file writing and remove dead code and silence compiler warnings 2018-11-15 16:50:56 -05:00
1fc3b4618c remove dead code and silence compiler warnings 2018-11-15 16:50:56 -05:00
eae9d27f6d OpenMP support from the compiler is not a requirement for USER-OMP. Without OpenMP, it is like the OPT package but for many more styles, so it is still useful and should be supported. 2018-11-15 16:50:56 -05:00
db29ec7eee complete workflow document 2018-11-15 14:58:02 -05:00
090778c42b Merge pull request #1204 from lammps/doc-plumed
Linkage mode improvements and documentation updates for USER-PLUMED package
2018-11-15 13:48:58 -05:00
db935dba5e Merge pull request #1201 from junghans/cmake_doc
cmake: update internal doc about how cmake finds executables
2018-11-15 13:48:17 -05:00
e160376365 incomplete first draft. committed for checking the markup in github. 2018-11-15 12:45:15 -05:00
d5f222464b Update README.md 2018-11-15 09:48:46 -07:00
4d9e2a014b add detailed build instructions and discussion of linkage modes for PLUMED library and USER-PLUMED package 2018-11-15 11:35:04 -05:00
8a4983e4bc reformatting and simplification of fix plumed docs 2018-11-15 10:53:38 -05:00
82d6aa9add interlink fixes colvars, plumed, and smd 2018-11-15 10:52:41 -05:00
4231ab3d57 correct some links 2018-11-15 10:52:03 -05:00
25914ea3f3 patch 15Nov2018 2018-11-15 10:17:25 -05:00
003bb28471 make @gtribello code owner of the USER-PLUMED package 2018-11-14 22:17:25 -05:00
a557644939 support all three plumed linkage modes with CMake as well. For downloaded and previously installed plumed lib 2018-11-14 22:13:18 -05:00
04520e627d add code and scripts to support all three plumed linkage modes with fix plumed for conventional build 2018-11-14 21:26:36 -05:00
952e52982e add comment to indicate code intended for backward compatibility only 2018-11-14 05:37:59 -05:00
a942d8b3ba use memset() for clearing of arrays 2018-11-14 05:30:23 -05:00
7a22b8aa62 check only in currently added data file atoms for dihedral overflow 2018-11-14 05:29:26 -05:00
4c1fbc359a use tagint when unpacking atom tags from communication buffers 2018-11-14 05:28:19 -05:00
2c644c5f2e Merge pull request #1197 from akohlmey/collected-small-fixes
Collection of small changes and bugfixes for the next release
2018-11-13 15:18:09 -05:00
b1186a971e Merge pull request #1202 from lammps/hyper
Add Hyper-dynamics to REPLICA package
2018-11-13 15:17:30 -05:00
2dbd575a4b Merge pull request #1203 from stanmoore1/kk_update
Update Kokkos library in LAMMPS to v2.7.24
2018-11-13 15:15:09 -05:00
4805e1df22 doc page additions for USER-PLUMED package 2018-11-13 08:29:07 -07:00
380f0e4971 remove some debugging code 2018-11-13 08:06:40 -07:00
a026ce9669 correct broken links detected by make mobi 2018-11-12 21:38:26 -05:00
7e779d16de correct broken links in manual reported by 'make html' 2018-11-12 21:33:37 -05:00
b776f0f29f remove dead code and silence warnings about unused parameters 2018-11-12 21:11:55 -05:00
443644025f silence compiler warnings 2018-11-12 20:50:14 -05:00
c4c90a96ec avoid void return from non-void function 2018-11-12 20:49:01 -05:00
5cb2463204 c++ style include files do not have a .h extension 2018-11-12 20:33:30 -05:00
5a4e44b75a remove accidentally duplicated code 2018-11-12 20:27:21 -05:00
0ca02b6f41 added new commands to doc pages, fixed a few missing entries as well 2018-11-12 17:23:15 -07:00
2b96dfd6cc Remove deprecated Kokkos code 2018-11-12 15:49:31 -07:00
c22c6e4d34 Add LAMMPS changes to Kokkos Makefile 2018-11-12 15:30:14 -07:00
b2d67bcbb5 Remove tpls dir 2018-11-12 15:18:06 -07:00
b3f08b38a2 Update Kokkos library in LAMMPS to v2.7.24 2018-11-12 15:16:26 -07:00
8e9d4f5bce modify bond style hybrid, so it can handle bond style quartic as a sub-style 2018-11-12 16:06:55 -05:00
fe07ad279d added NULL declarations to constructor, removed debug code 2018-11-12 12:32:54 -07:00
5062c43aea rename example outputs 2018-11-12 12:32:53 -07:00
90caf0019c fix doc page errors 2018-11-12 12:32:53 -07:00
3b7ebbb8df new hyper examples 2018-11-12 12:32:53 -07:00
d7a479d2f6 hyper example dir 2018-11-12 12:32:52 -07:00
0c8ce199af more updates to hyper docs 2018-11-12 12:32:52 -07:00
4a6f088c0b updates to hyper doc pages 2018-11-12 12:32:52 -07:00
56598fcd0b changes to prd command doc page 2018-11-12 12:32:52 -07:00
265c11dca9 more edits to hyper docs 2018-11-12 12:32:52 -07:00
d6631266ce doc files in wrong dir 2018-11-12 12:32:52 -07:00
fbd610b8a9 global/local hyperdynamics src and doc files 2018-11-12 12:32:52 -07:00
86d1304176 cmake: update doc about executables 2018-11-10 18:58:53 -07:00
f68d77c7af correct formatting 2018-11-09 08:03:58 -05:00
7a4f534676 replace non-ASCII character 2018-11-09 08:03:41 -05:00
729201ab93 fix typo reported in #1199 2018-11-09 08:03:19 -05:00
ab8215a669 remove dead code 2018-11-09 01:09:31 -05:00
fe04147ee0 fix typo 2018-11-09 01:09:22 -05:00
62b1159673 update presets for USER-PLUMED package. fix typo. 2018-11-09 01:08:57 -05:00
adeb0c2b54 replace faulty preprocessor logic
fixes #1196
2018-11-09 01:08:57 -05:00
420 changed files with 21554 additions and 2289 deletions

.github/CODEOWNERS vendored (1 line changed)

@ -29,6 +29,7 @@ src/USER-MEAMC/* @martok
src/USER-MOFFF/* @hheenen
src/USER-MOLFILE/* @akohlmey
src/USER-NETCDF/* @pastewka
src/USER-PLUMED/* @gtribello
src/USER-PHONON/* @lingtikong
src/USER-PTM/* @pmla
src/USER-OMP/* @akohlmey


@ -304,7 +304,7 @@ pkg_depends(USER-SCAFACOS MPI)
find_package(OpenMP QUIET)
option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND})
if(BUILD_OMP OR PKG_USER-OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
if(BUILD_OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
find_package(OpenMP REQUIRED)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@ -349,7 +349,7 @@ if(PKG_KSPACE)
endif()
endif()
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE)
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE OR PKG_USER-PLUMED)
find_package(LAPACK)
find_package(BLAS)
if(NOT LAPACK_FOUND OR NOT BLAS_FOUND)
@ -531,6 +531,12 @@ endif()
if(PKG_USER-PLUMED)
find_package(GSL REQUIRED)
set(PLUMED_MODE "static" CACHE STRING "Linkage mode for Plumed2 library")
set(PLUMED_MODE_VALUES static shared runtime)
set_property(CACHE PLUMED_MODE PROPERTY STRINGS ${PLUMED_MODE_VALUES})
validate_option(PLUMED_MODE PLUMED_MODE_VALUES)
string(TOUPPER ${PLUMED_MODE} PLUMED_MODE)
option(DOWNLOAD_PLUMED "Download Plumed (instead of using the system's one)" OFF)
if(DOWNLOAD_PLUMED)
include(ExternalProject)
@ -543,13 +549,29 @@ if(PKG_USER-PLUMED)
ExternalProject_get_property(plumed_build INSTALL_DIR)
set(PLUMED_INSTALL_DIR ${INSTALL_DIR})
list(APPEND LAMMPS_DEPS plumed_build)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o ${GSL_LIBRARIES} ${CMAKE_DL_LIBS})
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
"${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o" ${GSL_LIBRARIES} ${CMAKE_DL_LIBS} ${LAPACK_LIBRARIES})
elseif(PLUMED_MODE STREQUAL "SHARED")
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumed.so ${CMAKE_DL_LIBS})
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_INSTALL_DIR}/lib/libplumedKernel.so)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumedWrapper.a -rdynamic ${CMAKE_DL_LIBS})
endif()
set(PLUMED_INCLUDE_DIRS "${PLUMED_INSTALL_DIR}/include")
else()
find_package(PkgConfig REQUIRED)
pkg_check_modules(PLUMED plumed REQUIRED)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
elseif(PLUMED_MODE STREQUAL "SHARED")
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.shared)
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel.so)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.runtime)
endif()
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_LOAD})
endif()
include_directories(${PLUMED_INCLUDE_DIRS})


@ -1492,6 +1492,11 @@ target API.
</dl>
</td>
</tr>
<tr>
<td><code>BIN2C</code> (CUDA only)</td>
<td>Path to the bin2c executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>
@ -1647,9 +1652,8 @@ requires `gzip` to be in your `PATH`
</tr>
<tr>
<td><code>GZIP_EXECUTABLE</code></td>
<td></td>
<td>
</td>
<td>Path to the gzip executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>
@ -1679,9 +1683,8 @@ requires `ffmpeg` to be in your `PATH`
</tr>
<tr>
<td><code>FFMPEG_EXECUTABLE</code></td>
<td></td>
<td>
</td>
<td>Path to the ffmpeg executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -56,7 +56,8 @@ set(PKG_USER-MOFFF OFF CACHE BOOL "" FORCE)
set(PKG_USER-MOLFILE OFF CACHE BOOL "" FORCE)
set(PKG_USER-NETCDF OFF CACHE BOOL "" FORCE)
set(PKG_USER-OMP OFF CACHE BOOL "" FORCE)
set(PKG_USER-PHOFFOFF OFF CACHE BOOL "" FORCE)
set(PKG_USER-PHONON OFF CACHE BOOL "" FORCE)
set(PKG_USER-PLUMED OFF CACHE BOOL "" FORCE)
set(PKG_USER-QMMM OFF CACHE BOOL "" FORCE)
set(PKG_USER-QTB OFF CACHE BOOL "" FORCE)
set(PKG_USER-QUIP OFF CACHE BOOL "" FORCE)


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -0,0 +1,184 @@
# Outline of the GitHub Development Workflow
The purpose of this document is to provide a point of reference for the
core LAMMPS developers and other LAMMPS contributors to understand the
choices the LAMMPS developers have agreed on. Git and GitHub provide the
tools, but do not set policies, so it is up to the developers to come to
an agreement as to how to define and interpret policies. This document
is likely to change as our experiences and needs change and we try to
adapt accordingly. Last change 2018-11-15.
## Table of Contents
* [GitHub Merge Management](#github-merge-management)
* [Pull Requests](#pull-requests)
* [Pull Request Assignments](#pull-request-assignments)
* [Pull Request Reviews](#pull-request-reviews)
* [Pull Request Discussions](#pull-request-discussions)
* [Checklist for Pull Requests](#checklist-for-pull-requests)
* [GitHub Issues](#github-issues)
* [Milestones and Release Planning](#milestones-and-release-planning)
## GitHub Merge Management
In the interest of consistency, ONLY ONE of the core LAMMPS developers
should be doing the merging. This is currently
[@akohlmey](https://github.com/akohlmey) (Axel Kohlmeyer).
If this assignment needs to be changed, it shall be done right after a
stable release.
## Pull Requests
ALL changes to the LAMMPS code and documentation, however trivial, MUST
be submitted as a pull request to GitHub. All changes to the "master"
branch must be made exclusively through merging pull requests. The
"unstable" and "stable" branches, respectively are only to be updated
upon patch or stable releases with fast-forward merges based on the
associated tags. Pull requests may also be submitted to (long-running)
feature branches created by LAMMPS developers inside the LAMMPS project,
if needed. Those are not subject to the merge and review restrictions
discussed in this document, though, but are managed as needed on a
case-by-case basis.
### Pull Request Assignments
Pull requests can be "chaperoned" by one of the LAMMPS core developers.
This is indicated by who the pull request is assigned to. LAMMPS core
developers can self-assign or they can decide to assign a pull request
to a different LAMMPS developer. Being assigned to a pull request means
that this pull request may need some work, and the assignee is tasked
with determining what that work might be, if any, and may either implement
the required changes or ask the submitter of the pull request to implement
them. Even though all LAMMPS developers may have write access to pull
requests (if enabled by the submitter, which is the default), only the
submitter or the assignee of a pull request should make changes to it. During this
period the "work_in_progress" label shall be applied to the pull
request. The assignee gets to decide what happens to the pull request
next, e.g. whether it should be assigned to a different developer for
additional checks and changes, or is recommended to be merged. Removing
the "work_in_progress" label and assigning the pull request to the
developer tasked with merging signals that a pull request is ready to be
merged.
### Pull Request Reviews
People can be assigned to review a pull request in two ways:
* They can be assigned manually to review a pull request
by the submitter or a LAMMPS developer
* They can be automatically assigned, because a developer matches
a file pattern in the `.github/CODEOWNERS` file, which associates
developers with the code they contributed and maintain.
Reviewers are requested to state their appraisal of the proposed changes
and either approve or request changes. People may unassign themselves
from a review if they do not feel competent to judge the proposed changes. At
least one review from a LAMMPS developer with write access is required
before merging, in addition to the automated compilation tests. The
feature that makes reviews from code owners "hard" reviews (i.e. all of
them must approve before merging is allowed) is currently disabled,
and it is at the discretion of the merge maintainer to assess when
a sufficient degree of approval has been reached. Reviews may be
(automatically) dismissed when the reviewed code has been changed,
and then approval is required a second time.
### Pull Request Discussions
All discussions about a pull request should be kept as much as possible
on the pull request discussion page on GitHub, so that other developers
can later review the entire discussion after the fact and understand the
rationale behind choices made. Exceptions to this policy are technical
discussions that are centered on tools or policies themselves
(git, GitHub, C++) rather than on the content of the pull request.
### Checklist for Pull Requests
Here are some items to check:
* source and text files should not have CR/LF line endings (use dos2unix to remove)
* every new command or style should have documentation. The names of
source files (c++ and manual) should follow the name of the style.
(example: `src/fix_nve.cpp`, `src/fix_nve.h` for `fix nve` command,
implementing the class `FixNVE`, documented in `doc/src/fix_nve.txt`)
* all new style names should be lowercase; there must be no dashes,
blanks, or underscores separating words, only forward slashes.
* new style docs should be added to the "overview" files in
`doc/src/Commands_*.txt`, `doc/src/{fixes,computes,pairs,bonds,...}.txt`
and `doc/src/lammps.book`
* new files in packages should be added to `src/.gitignore`
* removed or renamed files in packages should be added to `src/Purge.list`
* C++ source files should use C++ style include files for accessing
C-library APIs, e.g. `#include <cstdlib>` instead of `#include <stdlib.h>`.
They should also use angle brackets instead of double quotes. Full list:
* assert.h -> cassert
* ctype.h -> cctype
* errno.h -> cerrno
* float.h -> cfloat
* limits.h -> climits
* math.h -> cmath
* complex.h -> complex
* setjmp.h -> csetjmp
* signal.h -> csignal
* stddef.h -> cstddef
* stdint.h -> cstdint
* stdio.h -> cstdio
* stdlib.h -> cstdlib
* string.h -> cstring
* time.h -> ctime
Do not replace (as they are C++-11): `inttypes.h` and `stdint.h`.
* Code should follow the C++-98 standard. C++-11 is only accepted
in individual special purpose packages
* indentation is two spaces per level
* there should be no tabs and no trailing whitespace
* header files, especially of new styles, should not include any
other headers, except the header with the base class or cstdio.
Forward declarations should be used instead when possible.
* iostreams should be avoided. LAMMPS uses stdio from the C-library.
* use of STL in headers and class definitions should be avoided.
* static class members should be avoided at all cost.
* anything storing atom IDs should be using `tagint` and not `int`.
This can be flagged by the compiler only for pointers and only when
compiling LAMMPS with `-DLAMMPS_BIGBIG`.
* when including both `lmptype.h` (and using defines or macros from it)
and `mpi.h`, `lmptype.h` must be included first.
## GitHub Issues
The GitHub issue tracker is the location where the LAMMPS developers
and other contributors or LAMMPS users can report issues or bugs with
the LAMMPS code or request new features to be added. Feature requests
are usually indicated by a `[Feature Request]` marker in the subject.
Issues are assigned to a person if this person is working on the
feature or working to resolve the issue. Issues that have nobody working
on them at the moment have the label `volunteer needed` attached.
When an issue, say `#125`, is resolved by a specific pull request,
the comment for the pull request shall contain the text `closes #125`
or `fixes #125`, so that the issue is automatically closed when
the pull request is merged.
## Milestones and Release Planning
LAMMPS uses a continuous release development model with incremental
changes, i.e. significant effort is made, including automated pre-merge
testing, to ensure that the code in the "master" branch does not get broken.
More extensive testing (including regression testing) is performed after
code is merged to the "master" branch. There are patch releases of
LAMMPS every 1-3 weeks, at a point when the LAMMPS developers feel that
a sufficient amount of changes has accumulated and the post-merge testing
has been successful. These patch releases are marked with a
`patch_<version date>` tag and the "unstable" branch follows only these
versions (and thus is always supposed to be of production quality,
unlike "master", which may be temporarily broken in the case of larger
change sets or unexpected incompatibilities or side effects).
About 3-4 times each year, there are going to be "stable" releases
of LAMMPS. These have seen additional, manual testing and review of
results from testing with instrumented code and static code analysis.
Also, the last 2-3 patch releases before a stable release are
"release candidate" versions, which only contain bugfixes and
documentation updates. For release planning and the information of
code contributors, issues and pull requests being actively worked on
are assigned a "milestone", which corresponds to the next stable
release or the stable release after that, with a tentative release
date.


@ -137,9 +137,9 @@ simply loading the appropriate module before building LAMMPS.
-D CMAKE_C_COMPILER=name # name of C compiler
-D CMAKE_Fortran_COMPILER=name # name of Fortran compiler :pre
-D CMAKE_CXX_FlAGS=string # flags to use with C++ compiler
-D CMAKE_C_FlAGS=string # flags to use with C compiler
-D CMAKE_Fortran_FlAGS=string # flags to use with Fortran compiler :pre
-D CMAKE_CXX_FLAGS=string # flags to use with C++ compiler
-D CMAKE_C_FLAGS=string # flags to use with C compiler
-D CMAKE_Fortran_FLAGS=string # flags to use with Fortran compiler :pre
By default CMake will use a compiler it finds and it will add
optimization flags appropriate to that compiler and any "accelerator


@ -41,11 +41,11 @@ This is the list of packages that may require additional steps.
"USER-ATC"_#user-atc,
"USER-AWPMD"_#user-awpmd,
"USER-COLVARS"_#user-colvars,
"USER-PLUMED" _#user-plumed,
"USER-H5MD"_#user-h5md,
"USER-INTEL"_#user-intel,
"USER-MOLFILE"_#user-molfile,
"USER-NETCDF"_#user-netcdf,
"USER-PLUMED"_#user-plumed,
"USER-OMP"_#user-omp,
"USER-QMMM"_#user-qmmm,
"USER-QUIP"_#user-quip,
@ -715,57 +715,98 @@ a corresponding Makefile.lammps.machine file.
USER-PLUMED package :h4,link(user-plumed)
Before building LAMMPS with this package, you must first build PLUMED.
PLUMED can be built as part of the LAMMPS build or installed separately
from LAMMPS using the generic "plumed installation instructions"_plumedinstall.
:link(plumedinstall,http://plumed.github.io/doc-master/user-doc/html/_installation.html)
PLUMED can be linked into MD codes in three different modes: static,
shared, and runtime. With the "static" mode, all required PLUMED code
is linked statically into the MD code. The MD code is then fully
independent of the PLUMED installation, but you also have to
rebuild/relink the MD code to update the PLUMED code inside it. With
"shared" linkage mode, the MD code is linked to a shared library
containing the PLUMED code, preferably after it was installed in a
globally accessible location. This way the same installed PLUMED code
can be shared across multiple MD packages and can be updated, as
long as the shared PLUMED library is ABI-compatible. The third linkage
mode is "runtime", which allows switching the PLUMED kernel at runtime
between different variants by setting the PLUMED_KERNEL environment
variable, which has to point to the location of the libplumedKernel.so
dynamic shared object, which is then loaded at runtime. This is
particularly convenient for doing PLUMED development and comparing
multiple PLUMED versions without having to recompile the hosting MD
code. All three linkage modes are supported by LAMMPS on selected
operating systems (e.g. Linux) using either the CMake or the traditional
make build. The "static" mode should be the most portable; the "runtime"
mode support in LAMMPS makes the most assumptions about the operating
system and compiler environment. If one mode does not work, try a
different one, or switch to a different build system, or consider
a global PLUMED installation or downloading it during building LAMMPS.
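As an illustration of the "runtime" mode, the kernel to load is selected
through the environment variable mentioned above before launching LAMMPS.
This is only a sketch; the path, binary name, and input script below are
placeholders that must be adapted to your installation:
export PLUMED_KERNEL=/path/to/plumed2/lib/libplumedKernel.so  # location of the PLUMED kernel to load at runtime
./lmp_mpi -in in.plumed                                       # placeholder LAMMPS binary and input script :pre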
[CMake build]:
-D DOWNLOAD_PLUMED=value # download PLUMED for build, value = no (default) or yes
-D PLUMED_MODE=value # Linkage mode for PLUMED, value = static (default), shared, or runtime :pre
If DOWNLOAD_PLUMED is set to "yes", the PLUMED library will be
downloaded (the version is hardcoded to a vetted version of
PLUMED, usually a recent stable release) and built inside the
CMake build directory. If DOWNLOAD_PLUMED is set to "no" (the default),
CMake will try to detect an installed version of PLUMED and link to
that. For this to work, the PLUMED library has to be installed into a
location where the pkg-config tool can find it or the PKG_CONFIG_PATH
environment variable has to be set up accordingly.
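For example, if PLUMED was installed into a non-standard prefix (the path
below is only an example), you can point pkg-config at it and verify the
detection before running CMake:
export PKG_CONFIG_PATH=$HOME/.local/lib/pkgconfig:$PKG_CONFIG_PATH  # adjust to your PLUMED install prefix
pkg-config --exists plumed && echo "PLUMED found by pkg-config" :pre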
The PLUMED_MODE setting determines the linkage mode of the PLUMED
library. Allowed values are "static" (default), "shared", or "runtime".
For a discussion of PLUMED linkage modes, please see above. When
enabling DOWNLOAD_PLUMED, the static linkage mode is recommended.
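Putting these options together, a CMake configuration could look like the
following sketch (run from a build directory next to the cmake folder;
directory names are examples only):
mkdir build; cd build
cmake -D PKG_USER-PLUMED=yes -D DOWNLOAD_PLUMED=yes -D PLUMED_MODE=static ../cmake
make :pre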
[Traditional make]:
Before building LAMMPS with this package, you must first build
PLUMED. We recommending building PLUMED separately to LAMMPS using
the instructions that can be found at http://plumed.github.io/doc-master/user-doc/html/_installation.html.
Before compiling LAMMPS you can then install the fix plumed command
and compile LAMMPS in the usual manner:
Before installing the USER-PLUMED package, first the PLUMED library
needs to be configured so that LAMMPS can find the right settings when
compiling and linking the LAMMPS executable itself. You can either
download and build PLUMED inside the LAMMPS plumed library folder or use
a previously installed PLUMED library and point LAMMPS to its
location. You also have to choose the linkage mode: "static" (default),
"shared" or "runtime". For a discussion of PLUMED linkage modes, please
see above.
Download/compilation/configuration of the plumed library can be done
from the src folder through the following make args:
make lib-plumed # print help message
make lib-plumed args="-b" # download and build PLUMED in lib/plumed/plumed2
make lib-plumed args="-p $HOME/.local" # use existing PLUMED installation in $HOME/.local
make lib-plumed args="-p /usr/local -m shared" # use existing PLUMED installation in
# /usr/local and use shared linkage mode
:pre
Note that two symbolic (soft) links, "includelink" and "liblink", are
created in lib/plumed to point to the location of the PLUMED build to
use, and also a new file lib/plumed/Makefile.lammps is created with
settings suitable for LAMMPS to compile and link PLUMED in the desired
linkage mode. After this step is completed, you can install the
USER-PLUMED package and compile LAMMPS in the usual manner:
make yes-user-plumed
make machine :pre
Once this compilation completes you should be able to run LAMMPS in the usual
way. When running LAMMPS with an input script that contains a fix
plumed command LAMMPS will try to call the PLUMED runtime library. PLUMED
must therefore be available in your path if LAMMPS is compiled in this way.
Once this compilation completes you should be able to run LAMMPS in the
usual way. For shared linkage mode, libplumed.so must be found by the
LAMMPS executable, which on many operating systems means you have to
set the LD_LIBRARY_PATH environment variable accordingly.
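For example, if libplumed.so was installed under /usr/local (the prefix,
binary name, and input script below are placeholders only):
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH  # folder containing libplumed.so
./lmp_machine -in in.plumed                             # placeholder LAMMPS binary and input script :pre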
On some machines it is not possible to call runtime libraries in the way described
above. When compiling on these machines it is thus better to statically link
PLUMED when compiling LAMMPS. To do this you must either download a PLUMED
tarball from http://www.plumed.org/get-it or clone it using
git clone https://github.com/plumed/plumed2.git. If you download the tarball
unpack it in the /lib/plumed directory. Similarly if you clone
it clone it to the /lib/plumed directory as if there is a version of PLUMED within
this directory LAMMPS will always try to statically link the version of PLUMED
that this directory contains instead of dynamically linking the library.
Support for the different linkage modes in LAMMPS varies across
operating systems; using static linkage is expected to be the most
portable, and it is thus set as the default.
Once you have downloaded PLUMED into /lib/plumed you must again build the code
here by following the instructions that can be found at
http://plumed.github.io/doc-master/user-doc/html/_installation.html.
You can statically link PLUMED manually and if you want to access the full
range of PLUMED functionalities this is what you should do. If you only want the
basic range of functionalities, however, (i.e. no user contributed modules) then
you can download and compile PLUMED in one step from the lammps/src dir, using a
command like like those below:
make lib-plumed # print help message
make lib-plumed args="-b" # download and build the latest stable version of PLUMED
These commands will simply invoke the lib/plumed/Install.py script with
args specified. Furthermore, once the script has completed you should
have a compiled version of PLUMED. With this built you can install/un-install
PLUMED and build LAMMPS in the usual manner:
make yes-user-plumed
make machine :pre
make no-user-plumed
make machine :pre
If you want to change the linkage mode, you have to re-run "make
lib-plumed" with the desired settings [and] do a reinstall if the
USER-PLUMED package with "make yes-user-plumed" to update the required
makefile settings with the changes in the lib/plumed folder.
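For example, to switch an existing build to runtime linkage against a
PLUMED installation under /usr/local (an illustrative path only), you
would redo these steps:
make lib-plumed args="-p /usr/local -m runtime"   # reconfigure lib/plumed for runtime linkage
make yes-user-plumed                              # refresh the package files in src
make machine :pre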
:line


@ -56,6 +56,7 @@ packages:
"USER-INTEL"_Build_extras.html#user-intel,
"USER-MOLFILE"_Build_extras.html#user-molfile,
"USER-NETCDF"_Build_extras.html#user-netcdf,
"USER-PLUMED"_Build_extras.html#user-plumed,
"USER-OMP"_Build_extras.html#user-omp,
"USER-QMMM"_Build_extras.html#user-qmmm,
"USER-QUIP"_Build_extras.html#user-quip,


@ -59,6 +59,7 @@ An alphabetic list of all LAMMPS commands.
"fix_modify"_fix_modify.html,
"group"_group.html,
"group2ndx"_group2ndx.html,
"hyper"_hyper.html,
"if"_if.html,
"info"_info.html,
"improper_coeff"_improper_coeff.html,


@ -78,6 +78,8 @@ OPT.
"grem"_fix_grem.html,
"halt"_fix_halt.html,
"heat"_fix_heat.html,
"hyper/global"_fix_hyper_global.html,
"hyper/local"_fix_hyper_local.html,
"imd"_fix_imd.html,
"indent"_fix_indent.html,
"ipi"_fix_ipi.html,
@ -108,7 +110,7 @@ OPT.
"nph/asphere (o)"_fix_nph_asphere.html,
"nph/body"_fix_nph_body.html,
"nph/eff"_fix_nh_eff.html,
"nph/sphere (ko)"_fix_nph_sphere.html,
"nph/sphere (o)"_fix_nph_sphere.html,
"nphug (o)"_fix_nphug.html,
"npt (iko)"_fix_nh.html,
"npt/asphere (o)"_fix_npt_asphere.html,
@ -128,7 +130,7 @@ OPT.
"nve/line"_fix_nve_line.html,
"nve/manifold/rattle"_fix_nve_manifold_rattle.html,
"nve/noforce"_fix_nve_noforce.html,
"nve/sphere (o)"_fix_nve_sphere.html,
"nve/sphere (ko)"_fix_nve_sphere.html,
"nve/spin"_fix_nve_spin.html,
"nve/tri"_fix_nve_tri.html,
"nvk"_fix_nvk.html,
@ -147,6 +149,7 @@ OPT.
"phonon"_fix_phonon.html,
"pimd"_fix_pimd.html,
"planeforce"_fix_planeforce.html,
"plumed"_fix_plumed.html,
"poems"_fix_poems.html,
"pour"_fix_pour.html,
"precession/spin"_fix_precession_spin.html,


@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="9 Nov 2018 version">
<META NAME="docnumber" CONTENT="15 Nov 2018 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
:line
LAMMPS Documentation :c,h1
9 Nov 2018 version :c,h2
15 Nov 2018 version :c,h2
"What is a LAMMPS version?"_Manual_version.html


@ -89,6 +89,7 @@ as contained in the file name.
"USER-NETCDF"_#PKG-USER-NETCDF,
"USER-OMP"_#PKG-USER-OMP,
"USER-PHONON"_#PKG-USER-PHONON,
"USER-PLUMED"_#PKG-USER-PLUMED,
"USER-PTM"_#PKG-USER-PTM,
"USER-QMMM"_#PKG-USER-QMMM,
"USER-QTB"_#PKG-USER-QTB,
@ -1187,7 +1188,7 @@ the NAMD MD code, but with portability in mind. Axel Kohlmeyer
[Install:]
This package has "specific installation
instructions"_Build_extras.html#gpu on the "Build
instructions"_Build_extras.html#user-colvars on the "Build
extras"_Build_extras.html doc page.
[Supporting info:]
@ -1201,18 +1202,20 @@ examples/USER/colvars :ul
:line
USER-PLUMED package :link(USER-PLUMED),h4
USER-PLUMED package :link(PKG-USER-PLUMED),h4
[Contents:]
The fix plumed command allows you to use the plugin for molecular
dynamics PLUMED to analyse and bias your LAMMPS trajectory on the fly.
In practise PLUMED is called from within the lammps input script by using
the "fix plumed _fix_plumed.html command.
The fix plumed command allows you to use the PLUMED free energy plugin
for molecular dynamics to analyse and bias your LAMMPS trajectory on
the fly. The PLUMED library is called from within the LAMMPS input
script by using the "fix plumed"_fix_plumed.html command.
[Authors:] The PLUMED library is written and maintained by
Massimilliano Bonomi, Giovanni Bussi, Carlo Camiloni and
Gareth Tribello.
[Authors:] The "PLUMED library"_#PLUMED is written and maintained by
Massimiliano Bonomi, Giovanni Bussi, Carlo Camilloni and Gareth
Tribello.
:link(PLUMED,http://www.plumed.org)
[Install:]
@ -1224,7 +1227,7 @@ extras"_Build_extras.html doc page.
src/USER-PLUMED/README
lib/plumed/README
"fix plumed "_fix_plumed.html
"fix plumed"_fix_plumed.html
examples/USER/plumed :ul
:line


@ -62,17 +62,20 @@ Package, Description, Doc page, Example, Library
"USER-NETCDF"_Packages_details.html#PKG-USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, n/a, ext
"USER-OMP"_Packages_details.html#PKG-USER-OMP, OpenMP-enabled styles,"Speed omp"_Speed_omp.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
"USER-PHONON"_Packages_details.html#PKG-USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, no
"USER-PLUMED"_Packages_details.html#PKG-USER-PLUMED, "PLUMED"_#PLUMED free energy library,"fix plumed"_fix_plumed.html, USER/plumed, ext
"USER-PTM"_Packages_details.html#PKG-USER-PTM, Polyhedral Template Matching,"compute ptm/atom"_compute_ptm_atom.html, n/a, no
"USER-QMMM"_Packages_details.html#PKG-USER-QMMM, QM/MM coupling,"fix qmmm"_fix_qmmm.html, USER/qmmm, ext
"USER-QTB"_Packages_details.html#PKG-USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, no
"USER-QUIP"_Packages_details.html#PKG-USER-QUIP, QUIP/libatoms interface,"pair_style quip"_pair_quip.html, USER/quip, ext
"USER-REAXC"_Packages_details.html#PKG-USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, no
"USER-SCAFACOS"_Packages_details.html#PKG-USER-SCAFACOS, wrapper on ScaFaCoS solver,"kspace_style scafacos"_kspace_style.html, USER/scafacos, ext
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal, USER/sdpd, no
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal.html, USER/sdpd, no
"USER-SMD"_Packages_details.html#PKG-USER-SMD, smoothed Mach dynamics,"SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, ext
"USER-SMTBQ"_Packages_details.html#PKG-USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, no
"USER-SPH"_Packages_details.html#PKG-USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, no
"USER-TALLY"_Packages_details.html#PKG-USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, no
"USER-UEF"_Packages_details.html#PKG-USER-UEF, extensional flow,"fix nvt/uef"_fix_nh_uef.html, USER/uef, no
"USER-VTK"_Packages_details.html#PKG-USER-VTK, dump output via VTK, "compute vtk"_dump_vtk.html, n/a, ext :tb(ea=c,ca1=l)
:link(MOFplus,https://www.mofplus.org/content/show/MOF-FF)
:link(PLUMED,http://www.plumed.org)


@ -44,6 +44,7 @@ Commands :h1
fix_modify
group
group2ndx
hyper
if
improper_coeff
improper_style


@ -176,6 +176,7 @@ compute"_Commands_compute.html doc page are followed by one or more of
(g,i,k,o,t) to indicate which accelerated styles exist.
"ackland/atom"_compute_ackland_atom.html -
"adf"_compute_adf.html - angular distribution function
"aggregate/atom"_compute_cluster_atom.html - aggregate ID for each atom
"angle"_compute_angle.html -
"angle/local"_compute_angle_local.html -


@ -117,5 +117,5 @@ package"_Build_package.html doc page for more info.
:line
:link(Larsen)
[(Larsen)] Larsen, Schmidt, Schiøtz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).
[(Larsen)] Larsen, Schmidt, Schiotz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).


@ -221,6 +221,8 @@ accelerated styles exist.
"grem"_fix_grem.html -
"halt"_fix_halt.html - terminate a dynamics run or minimization
"heat"_fix_heat.html - add/subtract momentum-conserving heat
"hyper/global"_fix_hyper_global.html - global hyperdynamics
"hyper/local"_fix_hyper_local.html - local hyperdynamics
"imd"_fix_imd.html -
"indent"_fix_indent.html - impose force due to an indenter
"ipi"_fix_ipi.html -
@ -238,6 +240,7 @@ accelerated styles exist.
"manifoldforce"_fix_manifoldforce.html -
"meso"_fix_meso.html -
"meso"_fix_meso_move.html - move mesoscopic SPH/SDPD particles in a prescribed fashion
"meso/move"_fix_meso_move.html -
"meso/stationary"_fix_meso_stationary.html -
"momentum"_fix_momentum.html - zero the linear and/or angular momentum of a group of atoms
"move"_fix_move.html - move atoms in a prescribed fashion
@ -293,6 +296,7 @@ accelerated styles exist.
"phonon"_fix_phonon.html -
"pimd"_fix_pimd.html -
"planeforce"_fix_planeforce.html - constrain atoms to move in a plane
"plumed"_fix_plumed.html - wrapper on PLUMED free energy library
"poems"_fix_poems.html - constrain clusters of atoms to move as coupled rigid bodies
"pour"_fix_pour.html - pour new atoms/molecules into a granular simulation domain
"precession/spin"_fix_precession_spin.html -


@ -41,7 +41,7 @@ react = mandatory argument indicating new reaction specification :l
fraction = initiate reaction with this probability if otherwise eligible
seed = random number seed (positive integer)
{stabilize_steps} value = timesteps
timesteps = number of timesteps to apply internally created nve/limit.html
timesteps = number of timesteps to apply internally created nve/limit fix :pre
{update_edges} value = {none} or {charges} :l
none = do not update topology near the edges of reaction templates
charges = update atomic charges of all atoms in reaction templates


@ -116,7 +116,8 @@ not a limitation of functionality.
[Related commands:]
"fix smd"_fix_smd.html
"fix smd"_fix_smd.html, "fix spring"_fix_spring.html,
"fix plumed"_fix_plumed.html
[Default:]
@ -126,4 +127,4 @@ and tstat = NULL.
:line
:link(Fiorin)
[(Fiorin)] Fiorin , Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594
[(Fiorin)] Fiorin, Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594


@ -0,0 +1,260 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
fix hyper/global command :h3
[Syntax:]
fix ID group-ID hyper/global cutbond qfactor Vmax Tequil :pre
ID, group-ID are documented in "fix"_fix.html command
hyper/global = style name of this fix command
cutbond = max distance at which a pair of atoms is considered bonded (distance units)
qfactor = max strain at which bias potential goes to 0.0 (unitless)
Vmax = height of bias potential (energy units)
Tequil = equilibration temperature (temperature units) :ul
[Examples:]
fix 1 all hyper/global 1.0 0.3 0.8 300.0 :pre
[Description:]
This fix is meant to be used with the "hyper"_hyper.html command to
perform a bond-boost global hyperdynamics (GHD) simulation. The role
of this fix is to select a single pair of atoms in the system at
each timestep to add a global bias potential to, which will alter the
dynamics of the system in a manner that effectively accelerates time.
This is in contrast to the "fix hyper/local"_fix_hyper_local.html
command, which can be used to perform a local hyperdynamics (LHD)
simulation, by adding a local bias potential to multiple pairs of
atoms at each timestep. GHD can time accelerate a small simulation
with up to a few hundred atoms. For larger systems, LHD is needed to
achieve good time acceleration.
For a system that undergoes rare transition events, where one or more
atoms move over an energy barrier to a new potential energy basin, the
effect of the bias potential is to induce more rapid transitions.
This can lead to a dramatic speed-up in the rate at which events
occur, without altering their relative frequencies, thus leading to
an overall increase in the elapsed real time of the simulation as
compared to running for the same number of timesteps with normal MD.
See the "hyper"_hyper.html doc page for a more general discussion of
hyperdynamics and citations that explain both GHD and LHD.
The equations and logic used by this fix and described here to perform
GHD follow the description given in "(Voter2013)"_#Voter2013ghd. The
bond-boost form of a bias potential for HD is due to Miron and
Fichthorn as described in "(Miron)"_#Mironghd. In LAMMPS we use a
simplified version of bond-boost GHD where a single bond in the system
is biased at any one timestep.
Bonds are defined between each pair of I,J atoms whose R0ij distance
is less than {cutbond}, when the system is in a quenched (minimum
energy) state. Note that these are not "bonds" in a covalent
sense. A bond is simply any pair of atoms that meet the distance
criterion. {Cutbond} is an argument to this fix; it is discussed
below. A bond is only formed if one or both of the I,J atoms are in
the specified group.
The current strain of bond IJ (when running dynamics) is defined as
Eij = (Rij - R0ij) / R0ij :pre
where Rij is the current distance between atoms I,J, and R0ij is the
equilibrium distance in the quenched state.
The bias energy Vij of any bond IJ is defined as
Vij = Vmax * (1 - (Eij/q)^2) for abs(Eij) < qfactor
= 0 otherwise :pre
where the prefactor {Vmax} and the cutoff {qfactor} are arguments to
this fix; they are discussed below. This functional form is an
inverse parabola centered at 0.0 with height Vmax and which goes to
0.0 at +/- qfactor.
Let Emax = the maximum of abs(Eij) for all IJ bonds in the system on a
given timestep. On that step, Vij is added as a bias potential to
only the single bond with strain Emax, call it Vij(max). Note that
Vij(max) will be 0.0 if Emax >= qfactor on that timestep. Also note
that Vij(max) is added to the normal interatomic potential that is
computed between all atoms in the system at every step.
The derivative of Vij(max) with respect to the position of each atom
in the Emax bond gives a bias force Fij(max) acting on the bond as
Fij(max) = - dVij(max)/dEij = 2 Vmax Eij / qfactor^2 for abs(Eij) < qfactor
= 0 otherwise :pre
which can be decomposed into an equal and opposite force acting on
only the two I,J atoms in the Emax bond.
The time boost factor for the system is given each timestep I by
Bi = exp(beta * Vij(max)) :pre
where beta = 1/kTequil, and {Tequil} is the temperature of the system
and an argument to this fix. Note that Bi >= 1 at every step.
NOTE: To run GHD, the input script must also use the "fix
langevin"_fix_langevin.html command to thermostat the atoms at the
same {Tequil} as specified by this fix, so that the system is running
constant-temperature (NVT) dynamics. LAMMPS does not check that this
is done.
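As a sketch of such a setup (the fix IDs, group, damping parameter, random
seed, and all numerical values below are placeholders, not recommendations),
a GHD run at a {Tequil} of 500 K could combine the fixes like this:
fix mdnve all nve
fix thermostat all langevin 500.0 500.0 1.0 358214
fix HG all hyper/global 3.2 0.3 0.4 500.0 :pre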
The elapsed time t_hyper for a GHD simulation running for {N}
timesteps is simply
t_hyper = Sum (i = 1 to N) Bi * dt :pre
where dt is the timestep size defined by the "timestep"_timestep.html
command. The effective time acceleration due to GHD is thus t_hyper /
N*dt, where N*dt is elapsed time for a normal MD run of N timesteps.
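As a rough illustration of the magnitudes involved (the numbers are
invented for this example): in metal units at a {Tequil} of 500 K, beta =
1/kT is roughly 23.2 1/eV, so a timestep with a bias of Vij(max) = 0.4 eV
contributes
Bi = exp(23.2 * 0.4) ~ 1.1e4 :pre
i.e. that single timestep advances t_hyper by roughly ten thousand times
the nominal timestep dt.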
Note that in GHD, the boost factor varies from timestep to timestep.
Likewise, which bond has Emax strain and thus which pair of atoms the
bias potential is added to, will also vary from timestep to timestep.
This is in contrast to local hyperdynamics (LHD) where the boost
factor is an input parameter; see the "fix
hyper/local"_fix_hyper_local.html doc page for details.
:line
Here is additional information on the input parameters for GHD.
The {cutbond} argument is the cutoff distance for defining bonds
between pairs of nearby atoms. A pair of I,J atoms in their
equilibrium, minimum-energy configuration, which are separated by a
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
crystalline lattice is a typical choice for solids, so that bonds
exist only between nearest neighbor pairs.
The {qfactor} argument is the limiting strain at which the bias
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
bond distance can be up to 30% larger or 30% smaller than the
equilibrium (quenched) R0ij distance and the two atoms in the bond
could still experience a non-zero bias force.
If {qfactor} is set too large, then transitions from one energy basin
to another are affected because the bias potential is non-zero at the
transition state (e.g. saddle point). If {qfactor} is set too small
then little boost is achieved because the Eij strain of some bond in
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
{qfactor} is typically reasonable.
The {Vmax} argument is the prefactor on the bias potential. Ideally,
it should be set to a value slightly less than the smallest barrier
height for an event to occur. Otherwise the applied bias potential
may be large enough (when added to the interatomic potential) to
produce a local energy basin with a maximum in the center. This can
produce artificial energy minima in the same basin that trap an atom.
Or if {Vmax} is even larger, it may induce an atom(s) to rapidly
transition to another energy basin. Both cases are "bad dynamics"
which violate the assumptions of GHD that guarantee an accelerated
time-accurate trajectory of the system.
Note that if {Vmax} is set too small, the GHD simulation will run
correctly. There will just be fewer events because the hyper time
(t_hyper equation above) will be shorter.
NOTE: If you have no physical intuition as to the smallest barrier
height in your system, a reasonable strategy to determine the largest
{Vmax} you can use for a GHD model is to run a sequence of
simulations with smaller and smaller {Vmax} values, until the event
rate does not change.
The {Tequil} argument is the temperature at which the system is
simulated; see the comment above about the "fix
langevin"_fix_langevin.html thermostatting. It is also part of the
beta term in the exponential factor that determines how much boost is
achieved as a function of the bias potential.
In general, the lower the value of {Tequil} and the higher the value
of {Vmax}, the more boost will be achievable by the GHD algorithm.
:line
[Restart, fix_modify, output, run start/stop, minimize info:]
No information about this fix is written to "binary restart
files"_restart.html.
The "fix_modify"_fix_modify.html {energy} option is supported by this
fix to add the energy of the bias potential to the system's
potential energy as part of "thermodynamic output"_thermo_style.html.
This fix computes a global scalar and global vector of length 11, which
can be accessed by various "output commands"_Howto_output.html. The
scalar is the magnitude of the bias potential (energy units) applied on
the current timestep. The vector stores the following quantities:
1 = boost factor on this step (unitless)
2 = max strain Eij of any bond on this step (unitless)
3 = ID of first atom in the max-strain bond
4 = ID of second atom in the max-strain bond
5 = average # of bonds/atom on this step :ul
6 = fraction of timesteps with bias = 0.0 during this run
7 = max drift distance of any atom during this run (distance units)
8 = max bond length during this run (distance units) :ul
9 = cumulative hyper time since fix was defined (time units)
10 = cumulative count of event timesteps since fix was defined
11 = cumulative count of atoms in events since fix was defined :ul
The first 5 quantities are for the current timestep. Quantities 6-8
are for the current hyper run. Quantities 9-11 are cumulative across
multiple runs (since the fix was defined in the input script).
For value 7, drift is the distance an atom moves between timesteps
when the bond list is reset, i.e. between events. Atoms involved in
an event will typically move the greatest distance since others are
typically oscillating around their lattice site.
For value 10, events are checked for by the "hyper"_hyper.html command
once every {Nevent} timesteps. This value is the count of those
timesteps on which one (or more) events were detected. It is NOT the
number of distinct events, since more than one event may occur in the
same {Nevent} time window.
For value 11, each time the "hyper"_hyper.html command checks for an
event, it invokes a compute to flag zero or more atoms as
participating in one or more events. E.g. atoms that have displaced
more than some distance from the previous quench state. Value 11 is
the cumulative count of the number of atoms participating in any of
the events that were found.
The scalar and vector values calculated by this fix are all
"intensive".
No parameter of this fix can be used with the {start/stop} keywords of
the "run"_run.html command. This fix is not invoked during "energy
minimization"_minimize.html.
[Restrictions:]
This command can only be used if LAMMPS was built with the REPLICA
package. See the "Build package"_Build_package.html doc page for more
info.
[Related commands:]
"hyper"_hyper.html, "fix hyper/local"_fix_hyper_local.html
[Default:] None
:line
:link(Voter2013ghd)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Mironghd)
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).

doc/src/fix_hyper_local.txt (new file, 404 lines)

@ -0,0 +1,404 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
fix hyper/local command :h3
[Syntax:]
fix ID group-ID hyper/local cutbond qfactor Vmax Tequil Dcut alpha Btarget :pre
ID, group-ID are documented in "fix"_fix.html command :ulb,l
hyper/local = style name of this fix command :l
cutbond = max distance at which a pair of atoms is considered bonded (distance units) :l
qfactor = max strain at which bias potential goes to 0.0 (unitless) :l
Vmax = estimated height of bias potential (energy units) :l
Tequil = equilibration temperature (temperature units) :l
Dcut = minimum distance between boosted bonds (distance units) :l
alpha = boostostat relaxation time (time units) :l
Btarget = desired time boost factor (unitless) :l
zero or more keyword/value pairs may be appended :l
keyword = {lostbond} or {check/bias} or {check/coeff}
{lostbond} value = error/warn/ignore
{check/bias} values = Nevery error/warn/ignore
{check/coeff} values = Nevery error/warn/ignore :pre
:ule
[Examples:]
fix 1 all hyper/local 1.0 0.3 0.8 300.0 :pre
[Description:]
This fix is meant to be used with the "hyper"_hyper.html command to
perform a bond-boost local hyperdynamics (LHD) simulation. The role
of this fix is to select multiple pairs of atoms in the system at
each timestep to add a local bias potential to, which will alter the
dynamics of the system in a manner that effectively accelerates time.
This is in contrast to the "fix hyper/global"_fix_hyper_global.html
command, which can be used to perform a global hyperdynamics (GHD)
simulation, by adding a global bias potential to a single pair of
atoms at each timestep. GHD can time accelerate a small simulation
with up to a few hundred atoms. For larger systems, LHD is needed to
achieve good time acceleration.
For a system that undergoes rare transition events, where one or more
atoms move over an energy barrier to a new potential energy basin, the
effect of the bias potential is to induce more rapid transitions.
This can lead to a dramatic speed-up in the rate at which events
occur, without altering their relative frequencies, thus leading to
an overall increase in the elapsed real time of the simulation as
compared to running for the same number of timesteps with normal MD.
See the "hyper"_hyper.html doc page for a more general discussion of
hyperdynamics and citations that explain both GHD and LHD.
The equations and logic used by this fix and described here to perform
LHD follow the description given in "(Voter2013)"_#Voter2013lhd. The
bond-boost form of a bias potential for HD is due to Miron and
Fichthorn as described in "(Miron)"_#Mironlhd.
To understand this description, you should first read the description
of the GHD algorithm on the "fix hyper/global"_fix_hyper_global.html
doc page. This description of LHD builds on the GHD description.
The definitions of bonds, Eij, and Emax are the same for GHD and LHD.
The formulas for Vij(max) and Fij(max) are also the same except for a
pre-factor Cij, explained below.
The bias energy Vij applied to a bond IJ with maximum strain is
Vij(max) = Cij * Vmax * (1 - (Eij/q)^2) for abs(Eij) < qfactor
= 0 otherwise :pre
The derivative of Vij(max) with respect to the position of each atom
in the IJ bond gives a bias force Fij(max) acting on the bond as
Fij(max) = - dVij(max)/dEij = 2 Cij Vmax Eij / qfactor^2 for abs(Eij) < qfactor
= 0 otherwise :pre
which can be decomposed into an equal and opposite force acting on
only the two I,J atoms in the IJ bond.
The key difference is that in GHD a bias energy and force is added (on
a particular timestep) to only one bond (pair of atoms) in the system,
which is the bond with maximum strain Emax.
In LHD, a bias energy and force can be added to multiple bonds
separated by the specified {Dcut} distance or more. A bond IJ is
biased if it is the maximum strain bond within its local
"neighborhood", which is defined as the bond IJ plus any neighbor
bonds within a distance {Dcut} from IJ. The "distance" between bond
IJ and bond KL is the minimum distance between any of the IK, IL, JK,
JL pairs of atoms.
For a large system, multiple bonds will typically meet this
requirement, and thus a bias potential Vij(max) will be applied to
many bonds on the same timestep.
In LHD, all bonds store a Cij prefactor which appears in the Vij(max)
and Fij(max) equations above. Note that the Cij factor scales the
strength of the bias energy and forces whenever bond IJ is the maximum
strain bond in its neighborhood.
Cij is initialized to 1.0 when a bond between the I,J atoms is first
defined. The specified {Btarget} factor is then used to adjust the
Cij prefactors for each bond every timestep in the following manner.
An instantaneous boost factor Bij is computed each timestep
for each bond, as
Bij = exp(beta * Vkl(max)) :pre
where Vkl(max) is the bias energy of the maxstrain bond KL within bond
IJ's neighborhood, beta = 1/kTequil, and {Tequil} is the temperature
of the system and an argument to this fix.
NOTE: To run LHD, the input script must also use the "fix
langevin"_fix_langevin.html command to thermostat the atoms at the
same {Tequil} as specified by this fix, so that the system is running
constant-temperature (NVT) dynamics. LAMMPS does not check that this
is done.
Note that if IJ = KL, then bond IJ is a biased bond on that timestep,
otherwise it is not. But regardless, the boost factor Bij can be
thought of as an estimate of the time boost currently being applied within a
local region centered on bond IJ. For LHD, we want this to be the
specified {Btarget} value everywhere in the simulation domain.
To accomplish this, if Bij < Btarget, the Cij prefactor for bond IJ is
incremented on the current timestep by an amount proportional to the
inverse of the specified {alpha} and the difference (Bij - Btarget).
Conversely if Bij > Btarget, Cij is decremented by the same amount.
This procedure is termed "boostostatting" in
"(Voter2013)"_#Voter2013lhd. It drives all of the individual Cij to
values such that when Vij(max) is applied as a bias to bond IJ, the
resulting boost factor Bij will be close to {Btarget} on average.
Thus the LHD time acceleration factor for the overall system is
effectively {Btarget}.
Note that in LHD, the boost factor {Btarget} is specified by the user.
This is in contrast to global hyperdynamics (GHD) where the boost
factor varies each timestep and is computed as a function of {Vmax},
Emax, and {Tequil}; see the "fix hyper/global"_fix_hyper_global.html
doc page for details.
:line
Here is additional information on the input parameters for LHD.
Note that the {cutbond}, {qfactor}, and {Tequil} arguments have the
same meaning as for GHD. The {Vmax} argument is slightly different.
The {Dcut}, {alpha}, and {Btarget} parameters are unique to LHD.
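For reference, the local hyperdynamics example input bundled in
examples/hyper thermostats the mobile atoms at {Tequil} = 400 K and
defines the fix with the following argument values:
fix 2 mobile langevin 400.0 400.0 1.0 826626413 zero yes
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 4000.0 :pre
i.e. {cutbond} = 3.2, {qfactor} = 0.3, {Vmax} = 0.4, {Tequil} = 400.0,
{Dcut} = 10.0, {alpha} = 200.0, and {Btarget} = 4000.0.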
The {cutbond} argument is the cutoff distance for defining bonds
between pairs of nearby atoms. A pair of I,J atoms in their
equilibrium, minimum-energy configuration, which are separated by a
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
crystalline lattice is a typical choice for solids, so that bonds
exist only between nearest neighbor pairs.
The {qfactor} argument is the limiting strain at which the bias
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
bond distance can be up to 30% larger or 30% smaller than the
equilibrium (quenched) R0ij distance and the two atoms in the bond
could still experience a non-zero bias force.
If {qfactor} is set too large, then transitions from one energy basin
to another are affected because the bias potential is non-zero at the
transition state (e.g. saddle point). If {qfactor} is set too small,
then little boost can be achieved because the Eij strain of some bond in
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
{qfactor} is typically reasonable.
The {Vmax} argument is a fixed prefactor on the bias potential. There
is also a dynamic prefactor Cij, driven by the choice of {Btarget}
as discussed above. The product of these should be a value less than
the smallest barrier height for an event to occur. Otherwise the
applied bias potential may be large enough (when added to the
interatomic potential) to produce a local energy basin with a maximum
in the center. This can produce artificial energy minima in the same
basin that trap an atom. Or if Cij*{Vmax} is even larger, it may
induce one or more atoms to rapidly transition to another energy basin. Both
cases are "bad dynamics" which violate the assumptions of LHD that
guarantee an accelerated time-accurate trajectory of the system.
NOTE: It may seem that {Vmax} can be set to any value, and Cij will
compensate to reduce the overall prefactor if necessary. However the
Cij are initialized to 1.0 and the boostostatting procedure typically
operates slowly enough that there can be a time period of bad dynamics
if {Vmax} is set too large. A better strategy is to set {Vmax} to the
smallest barrier height for an event (the same as for GHD), so that
the Cij remain near unity.
The {Tequil} argument is the temperature at which the system is
simulated; see the comment above about the "fix
langevin"_fix_langevin.html thermostatting. It is also part of the
beta term in the exponential factor that determines how much boost is
achieved as a function of the bias potential. See the discussion of
the {Btarget} argument below.
As discussed above, the {Dcut} argument is the distance required
between two locally maxstrain bonds for them to both be selected as
biased bonds on the same timestep. Computationally, the larger {Dcut}
is, the more work (computation and communication) must be done each
timestep within the LHD algorithm. Likewise, the larger {Dcut} is, the
fewer bonds can be biased simultaneously, which may mean the specified
{Btarget} time acceleration cannot be achieved.
Physically, {Dcut} should be large enough that simultaneously biasing
two pairs of atoms separated by that distance will not influence the
dynamics of either pair, e.g. something like 2x the cutoff of the interatomic
potential. In practice a {Dcut} value of ~10 Angstroms seems to work
well for many solid-state systems.
NOTE: You must also ensure that ghost atom communication is performed
for a distance of at least {Dcut} + {cutevent} where {cutevent} = the
distance one or more atoms move (between quenched states) to be
considered an "event". It is an argument to the "compute
event/displace" command used to detect events. By default the ghost
communication distance is set by the pair_style cutoff, which will
typically be < {Dcut}. The "comm_modify cutoff"_comm_modify.html
command can be used to set the ghost cutoff explicitly, e.g.
comm_modify cutoff 12.0 :pre
This fix does not know the {cutevent} parameter, but uses half the
bond length as an estimate to warn if the ghost cutoff is not long
enough.
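For example, the local hyperdynamics input in examples/hyper uses
{Dcut} = 10.0 and a "compute
event/displace"_compute_event_displace.html cutoff of 1.1, and sets
the ghost cutoff to 12.0, which exceeds {Dcut} + {cutevent} with some
margin:
variable Dcut index 10.0
variable cutevent index 1.1
variable ghostcut index 12.0
comm_modify cutoff ${ghostcut} :pre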
As described above, the {alpha} argument is a pre-factor in the
boostostat update equation for each bond's Cij prefactor. {Alpha} is
specified in time units, similar to other thermostat or barostat
damping parameters. It is roughly the physical time it will take the
boostostat to adjust a Cij value that is too high (or too low) to
a correct one. An {alpha} setting of a few ps is typically good for
solid-state systems. Note that the {alpha} argument here is the
inverse of the alpha parameter discussed in
"(Voter2013)"_#Voter2013lhd.
The {Btarget} argument is the desired time boost factor (a value > 1)
that all the atoms in the system will experience. The elapsed time
t_hyper for an LHD simulation running for {N} timesteps is simply
t_hyper = Btarget * N*dt :pre
where dt is the timestep size defined by the "timestep"_timestep.html
command. The effective time acceleration due to LHD is thus t_hyper /
N*dt = Btarget, where N*dt is elapsed time for a normal MD run
of N timesteps.
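For example, the local hyperdynamics input in examples/hyper uses
{Btarget} = 4000 with {N} = 1500 steps and a 0.005 ps timestep, so
that t_hyper = 4000 * 1500 * 0.005 = 30000 ps = 30 ns, compared to
only 7.5 ps of direct MD for the same number of steps:
variable thyper equal 4000.0*1500*0.005    # Btarget * N * dt
print "t_hyper = ${thyper} ps" :pre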
You cannot choose an arbitrarily large setting for {Btarget}. The
maximum value you should choose is
Btarget = exp(beta * Vsmall) :pre
where Vsmall is the smallest event barrier height in your system, beta
= 1/kTequil, and {Tequil} is the specified temperature of the system
(both by this fix and the Langevin thermostat).
Note that if {Btarget} is set smaller than this, the LHD simulation
will run correctly. There will just be fewer events because the hyper
time (t_hyper equation above) will be shorter.
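As a hypothetical worked example: if the smallest barrier in your
system were ~0.4 eV (the {Vmax} value used in the bundled examples)
and {Tequil} = 400 K, the upper limit on {Btarget} would be
exp(0.4/kTequil), or roughly 1e5, so the {Btarget} = 4000 used in the
examples/hyper input is comfortably below that limit:
variable beta equal 1.0/(8.617e-5*400.0)   # 1/kTequil in 1/eV
variable Bmax equal exp(v_beta*0.4)        # assumed Vsmall = 0.4 eV
print "upper limit on Btarget = ${Bmax}" :pre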
NOTE: If you have no physical intuition as to the smallest barrier
height in your system, a reasonable strategy to determine the largest
{Btarget} you can use for an LHD model, is to run a sequence of
simulations with smaller and smaller {Btarget} values, until the event
rate does not change.
:line
[Restart, fix_modify, output, run start/stop, minimize info:]
No information about this fix is written to "binary restart
files"_restart.html.
The "fix_modify"_fix_modify.html {energy} option is supported by this
fix to add the energy of the bias potential to the system's
potential energy as part of "thermodynamic output"_thermo_style.html.
This fix computes a global scalar and global vector of length 23,
which can be accessed by various "output
commands"_Howto_output.html. The scalar is the magnitude of
the bias potential (energy units) applied on the current timestep,
summed over all biased bonds. The vector stores the following
quantities:
1 = # of biased bonds on this step
2 = max strain Eij of any bond on this step (unitless)
3 = average bias potential for all biased bonds on this step (energy units)
4 = average # of bonds/atom on this step
5 = average neighbor bonds/bond on this step within {Dcut} :ul
6 = fraction of steps and bonds with no bias during this run
7 = max drift distance of any atom during this run (distance units)
8 = max bond length during this run (distance units)
9 = average # of biased bonds/step during this run
10 = average bias potential for all biased bonds during this run (energy units)
11 = max bias potential for any biased bond during this run (energy units)
12 = min bias potential for any biased bond during this run (energy units)
13 = max distance from my sub-box of any ghost atom with maxstrain < qfactor during this run (distance units)
14 = max distance outside my box of any ghost atom with any maxstrain during this run (distance units)
15 = count of ghost neighbor atoms not found on reneighbor steps during this run
16 = count of lost bond partners during this run
17 = average bias coeff for lost bond partners during this run
18 = count of bias overlaps found during this run
19 = count of non-matching bias coefficients found during this run :ul
20 = cumulative hyper time since fix created (time units)
21 = cumulative count of event timesteps since fix created
22 = cumulative count of atoms in events since fix created
23 = cumulative # of new bonds since fix created :ul
The first quantities (1-5) are for the current timestep. Quantities
6-19 are for the current hyper run. They are reset each time a new
hyper run is performed. Quantities 20-23 are cumulative across
multiple runs (since the fix was defined in the input script).
For value 6, the numerator is a count of all biased bonds on every
timestep whose bias energy = 0.0 due to Eij >= {qfactor}. The
denominator is the count of all biased bonds on all timesteps.
For value 7, drift is the distance an atom moves between timesteps
when the bond list is reset, i.e. between events. Atoms involved in
an event will typically move the greatest distance since others are
typically oscillating around their lattice site.
For values 13 and 14, the maxstrain of a ghost atom is the maxstrain
of any bond it is part of, and it is checked for ghost atoms within
the bond neighbor cutoff.
Values 15-19 are mostly useful for debugging and diagnostic purposes.
For values 15-17, it is possible that a ghost atom owned by another
processor will move far enough (e.g. as part of an event-in-progress)
that it will no longer be within the communication cutoff distance for
acquiring ghost atoms. Likewise it may be a ghost atom bond partner
that cannot be found because it has moved too far. These values count
those occurrences. Because they typically involve atoms that are part
of events, they do not usually indicate bad dynamics. Value 17 is the
average bias coefficient for bonds where a partner atom was lost.
For value 18, no two bonds should be biased if they are within a
{Dcut} distance of each other. This value should be zero, indicating
that no pair of bonds "overlap", meaning they are closer than {Dcut}
from each other.
For value 19, the same bias coefficient is stored by both atoms in an
IJ bond. This value should be zero, indicating that for all bonds,
each atom in the bond stores a bias coefficient with the same
value.
Value 20 is simply the specified {Btarget} factor times the number of
timesteps times the timestep size.
For value 21, events are checked for by the "hyper"_hyper.html command
once every {Nevent} timesteps. This value is the count of those
timesteps on which one (or more) events was detected. It is NOT the
number of distinct events, since more than one event may occur in the
same {Nevent} time window.
For value 22, each time the "hyper"_hyper.html command checks for an
event, it invokes a compute to flag zero or more atoms as
participating in one or more events. E.g. atoms that have displaced
more than some distance from the previous quench state. Value 22 is
the cumulative count of atoms participating in any of
the events that were found.
Value 23 tallies the number of new bonds created by the bond reset
operation. Bonds between a specific I,J pair of atoms may persist for
the entire hyperdynamics simulation if neither I nor J is involved in
an event.
The scalar and vector values calculated by this fix are all
"intensive".
No parameter of this fix can be used with the {start/stop} keywords of
the "run"_run.html command. This fix is not invoked during "energy
minimization"_minimize.html.
[Restrictions:]
This fix is part of the REPLICA package. It is only enabled if LAMMPS
was built with that package. See the "Build package"_Build_package.html
doc page for more info.
[Related commands:]
"hyper"_hyper.html, "fix hyper/global"_fix_hyper_global.html
[Default:] None
:line
:link(Voter2013lhd)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Mironlhd)
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).

View File

@ -25,33 +25,32 @@ fix pl all plumed all plumed plumedfile plumed.dat outfile p.log
[Description:]
This fix instructs LAMMPS to call the PLUMED library, which allows one
to perform various forms of trajectory analysis on the fly and to also
use methods such as umbrella sampling and metadynamics to enhance the
sampling of phase space.
This fix instructs LAMMPS to call the "PLUMED"_plumedhome library, which
allows one to perform various forms of trajectory analysis on the fly
and to also use methods such as umbrella sampling and metadynamics to
enhance the sampling of phase space.
The documentation included here only describes the fix plumed command.
This command is LAMMPS specific whereas most of the functionality
implemented in PLUMED will work with a range of MD codes and also when
PLUMED is used as a stand alone code. The full documentation for PLUMED
is available at "this website"_http://www.plumed.org/documentation
The documentation included here only describes the fix plumed command
itself. This command is LAMMPS specific, whereas most of the
functionality implemented in PLUMED will work with a range of
MD codes, and also when PLUMED is used as a stand-alone code for analysis.
The full "documentation for PLUMED"_plumeddocs is available online and
included in the PLUMED source code. The PLUMED library development is
hosted at
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2
A detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
The PLUMED library is developed at
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2 A
detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
There are some example scripts for using this package with LAMMPS in the
There is an example input for using this package with LAMMPS in the
examples/USER/plumed directory.
:line
The command to call PLUMED above is reasonably self explanatory. Within
the input file for lammps the user is required to specify the input file
for PLUMED and a file on which to output the PLUMED log. The user must
specify both of these arguments every time PLUMED is to be used.
Furthermore, the fix plumed command should appear in the LAMMPS input
file after the relevant input paramters (e.g. the timestep) have been
set.
The command to make LAMMPS call PLUMED during a run requires two keyword/value
pairs pointing to the PLUMED input file and an output file for the
PLUMED log. The user must specify these arguments every time PLUMED is
to be used. Furthermore, the fix plumed command should appear in the
LAMMPS input file [after] relevant input parameters (e.g. the timestep)
have been set.
The {group-ID} entry is ignored. LAMMPS will always pass all the atoms
to PLUMED and there can only be one instance of the plumed fix at a
@ -64,10 +63,10 @@ functionality by only allowing only one plumed fix in the LAMMPS input.
The {plumedfile} keyword allows the user to specify the name of the
PLUMED input file. Instructions as to what should be included in a
plumed input file can be found in the "documentation for
PLUMED"_http://www.plumed.org/documentation.
PLUMED"_plumeddocs
The {outfile} keyword allows the user to specify the name of a file on
which to output the PLUMED log. This log file normally just parots the
which to output the PLUMED log. This log file normally just parrots the
information that is contained in the input file. The names of the files
on which the results from the various analyses that have been performed
using PLUMED will be specified by the user in the PLUMED input file.
@ -76,12 +75,13 @@ using PLUMED will be specified by the user in the PLUMED input file.
When performing a restart of a calculation that involves PLUMED you must
include a RESTART command in the PLUMED input file as detailed in the
"PLUMED documentation"_http://www.plumed.org/documentation. When the
restart command is found in the PLUMED input PLUMED will append to the
files that were generated in the run that was performed previously.
Furthermore, any history dependent bias potentials that were accumulated
in previous calculations will be read in when the restart command is
included in the PLUMED input.
"PLUMED documentation"_plumeddocs. When the restart command is found in
the PLUMED input PLUMED will append to the files that were generated in
the run that was performed previously. No part of the PLUMED restart
data is included in the LAMMPS restart files. Furthermore, any history
dependent bias potentials that were accumulated in previous calculations
will be read in when the RESTART command is included in the PLUMED
input.
The "fix_modify"_fix_modify.html {energy} option is not supported by
this fix.
@ -97,10 +97,7 @@ This fix is part of the USER-PLUMED package. It is only enabled if
LAMMPS was built with that package. See the "Build
package"_Build_package.html doc page for more info.
There can only be one plumed fix active at a time. Since the interface
communicates only the minimum amount of information and since the PLUMED
module itself can handle an arbitrary number of analysis and biasing
methods, this is not a limitation of functionality.
There can only be one plumed fix active at a time.
[Related commands:]
@ -115,3 +112,6 @@ The default options are plumedfile = NULL and outfile = NULL
:link(PLUMED)
[(PLUMED)] G.A. Tribello, M. Bonomi, D. Branduardi, C. Camilloni and G. Bussi, Comp. Phys. Comm 185, 604 (2014)
:link(plumeddocs,http://www.plumed.org/documentation)
:link(plumedhome,http://www.plumed.org/)

View File

@ -137,7 +137,8 @@ package"_Build_package.html doc page for more info.
"fix drag"_fix_drag.html, "fix spring"_fix_spring.html,
"fix spring/self"_fix_spring_self.html,
"fix spring/rg"_fix_spring_rg.html
"fix spring/rg"_fix_spring_rg.html,
"fix colvars"_fix_colvars.html, "fix plumed"_fix_plumed.html
[Default:] none

View File

@ -57,6 +57,8 @@ Fixes :h1
fix_grem
fix_halt
fix_heat
fix_hyper_global
fix_hyper_local
fix_imd
fix_indent
fix_ipi

192
doc/src/hyper.txt Normal file
View File

@ -0,0 +1,192 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
hyper command :h3
[Syntax:]
hyper N Nevent fix-ID compute-ID keyword values ... :pre
N = # of timesteps to run :ulb,l
Nevent = check for events every this many steps :l
fix-ID = ID of a fix that applies a global or local bias potential, can be NULL :l
compute-ID = ID of a compute that identifies when an event has occurred :l
zero or more keyword/value pairs may be appended :l
keyword = {min} or {dump} or {rebond} :l
{min} values = etol ftol maxiter maxeval
etol = stopping tolerance for energy, used in quenching
ftol = stopping tolerance for force, used in quenching
maxiter = max iterations of minimize, used in quenching
maxeval = max number of force/energy evaluations, used in quenching
{dump} value = dump-ID
dump-ID = ID of dump to trigger whenever an event takes place
{rebond} value = Nrebond
Nrebond = frequency at which to reset bonds, even if no event has occurred
:pre
:ule
[Examples:]
compute event all event/displace 1.0
fix HG mobile hyper/global 3.0 0.3 0.4 800.0
hyper 5000 100 HG event min 1.0e-6 1.0e-6 100 100 dump 1 dump 5 :pre
[Description:]
Run a bond-boost hyperdynamics (HD) simulation where time is
accelerated by application of a bias potential to one or more pairs of
nearby atoms in the system. This command can be used to run both
global and local hyperdynamics. In global HD a single bond within the
system is biased on each timestep. In local HD multiple bonds
(separated by a sufficient distance) can be biased simultaneously at
each timestep. In the bond-boost hyperdynamics context, a "bond" is
not a covalent bond between a pair of atoms in a molecule. Rather it
is simply a pair of nearby atoms as discussed below.
Both global and local HD are described in "(Voter2013)"_#Voter2013 by
Art Voter and collaborators. Similar to parallel replica dynamics
(PRD), global and local HD are methods for performing accelerated
dynamics that are suitable for infrequent-event systems that obey
first-order kinetics. A good overview of accelerated dynamics methods
for such systems is given in "(Voter2002)"_#Voter2002hd from the same
group. To quote from the review paper: "The dynamical evolution is
characterized by vibrational excursions within a potential basin,
punctuated by occasional transitions between basins." The transition
probability is characterized by p(t) = k*exp(-kt) where k is the rate
constant. Running multiple replicas gives an effective enhancement in
the timescale spanned by the multiple simulations, while waiting for
an event to occur.
Both HD and PRD produce a time-accurate trajectory that effectively
extends the timescale over which a system can be simulated, but they
do it differently. HD uses a single replica of the system and
accelerates time by biasing the interaction potential in a manner such
that each timestep is effectively longer. PRD creates Nr replicas of
the system and runs dynamics on each independently with a normal
unbiased potential until an event occurs in one of the replicas. The
time between events is reduced by a factor of Nr replicas. For both
methods, per CPU second, more physical time elapses and more events
occur. See the "prd"_prd.html doc page for more info about PRD.
An HD run has several stages, which are repeated each time an event
occurs, as explained below. The logic for an HD run is as follows:
quench
create initial list of bonds :pre
while (time remains):
run dynamics for Nevent steps
quench
check for an event
if event occurred: reset list of bonds
restore pre-quench state :pre
The list of bonds is the list of pairs of atoms that are within a
short cutoff distance of each other after the system energy is
minimized (quenched). This list is created and reset by a "fix
hyper/global"_fix_hyper_global.html or "fix
hyper/local"_fix_hyper_local.html command specified as {fix-ID}. At
every dynamics timestep, the same fix selects one or more bonds to
apply a bias potential to.
IMPORTANT NOTE: The style of fix associated with the specified
{fix-ID} determines whether you are running the global versus local
hyperdynamics algorithm.
Dynamics (with the bias potential) is run continuously, stopping every
{Nevent} steps to check if a transition event has occurred. The
specified {N} for total steps must be a multiple of {Nevent}. The check
is performed by quenching the system and comparing the resulting atom
coordinates to the coordinates from the previous basin.
A quench is an energy minimization and is performed by whichever
algorithm has been defined by the "min_style"_min_style.html command.
Minimization parameters may be set via the
"min_modify"_min_modify.html command and by the {min} keyword of the
hyper command. The latter are the settings that would be used with
the "minimize"_minimize.html command. Note that typically, you do not
need to perform a highly-converged minimization to detect a transition
event, though you may need to in order to prevent a set of atoms in
the system from relaxing to a saddle point.
The event check is performed by a compute with the specified
{compute-ID}. Currently there is only one compute that works with the
hyper command, which is the "compute
event/displace"_compute_event_displace.html command. Other
event-checking computes may be added. "Compute
event/displace"_compute_event_displace.html checks whether any atom in
the compute group has moved further than a specified threshold
distance. If so, an event has occurred.
If this happens, the list of bonds is reset, since some bond pairs
are likely now too far apart, and new pairs are likely close enough
to be considered a bond. The pre-quenched state of the
system (coordinates and velocities) is restored, and dynamics continue.
At the end of the hyper run, a variety of statistics are output to the
screen and logfile. These include info relevant to both global and
local hyperdynamics, such as the number of events and the elapsed
hyper time (accelerated time), as well as info specific to one or
the other, depending on which style of fix was specified by {fix-ID}.
:line
The optional keywords operate as follows.
As explained above, the {min} keyword can be used to specify
parameters for the quench. Their meaning is the same
as for the "minimize"_minimize.html command
The {dump} keyword can be used to trigger a specific dump command with
the specified {dump-ID} to output a snapshot each time an event is
detected. It can be specified multiple times with different {dump-ID}
values, as in the example above. These snapshots will be for the
quenched state of the system on a timestep that is a multiple of
{Nevent}, i.e. a timestep after the event has occurred. Note that any
dump command in the input script will also output snapshots at
whatever timestep interval it defines via its {N} argument; see the
"dump"_dump.html command for details. This means if you only want a
particular dump to output snapshots when events are detected, you
should specify its {N} as a value larger than the length of the
hyperdynamics run.
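For example, the global hyperdynamics input in examples/hyper runs for
100000 steps but declares its dump with {N} = 1000000, so snapshots
are written only when triggered by the {dump} keyword of the hyper
command. A simplified version of those two lines (with a plain
atom-style dump substituted for the image dump used in the actual
example) looks like this:
dump 1 all atom 1000000 dump.hyper
hyper 100000 1000 HG event min 1.0e-6 1.0e-6 100 100 dump 1 :pre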
As in the code logic above, the bond list is normally only reset when
an event occurs. The {rebond} keyword will force a reset of the bond
list every {Nrebond} steps, even if an event has not occurred.
{Nrebond} must be a multiple of {Nevent}. This can be useful to check
if more frequent resets alter event statistics, perhaps because the
parameters chosen for defining what is a bond and what is an event are
producing bad dynamics in the presence of the bias potential.
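For example (hypothetical values), resetting the bond list every 10000
steps during a 100000-step run with {Nevent} = 1000, using the fix and
compute IDs from the Examples section above, could be requested as:
hyper 100000 1000 HG event rebond 10000 :pre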
:line
[Restrictions:]
This command can only be used if LAMMPS was built with the REPLICA
package. See the "Build package"_Build_package.html doc
page for more info.
[Related commands:]
"fix hyper/global"_fix_hyper_global.html, "fix
hyper/local"_fix_hyper_local.html, "compute
event/displace"_compute_event_displace.html, "prd"_prd.html
[Default:]
The option defaults are min = 0.1 0.1 40 50.
:line
:link(Voter2013)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Voter2002hd)
[(Voter2002)] Voter, Montalenti, Germann, Annual Review of Materials
Research 32, 321 (2002).

View File

@ -160,6 +160,7 @@ dump_cfg_uef.html
echo.html
group.html
group2ndx.html
hyper.html
if.html
include.html
info.html
@ -277,6 +278,8 @@ fix_gravity.html
fix_grem.html
fix_halt.html
fix_heat.html
fix_hyper_global.html
fix_hyper_local.html
fix_imd.html
fix_indent.html
fix_ipi.html

View File

@ -48,11 +48,12 @@ replicas of a system. One or more replicas can be used. The total
number of steps {N} to run can be interpreted in one of two ways; see
discussion of the {time} keyword below.
PRD is described in "this paper"_#Voter1998 by Art Voter. It is a method
for performing accelerated dynamics that is suitable for
infrequent-event systems that obey first-order kinetics. A good
overview of accelerated dynamics methods for such systems in given in
"this review paper"_#Voter2002prd from the same group. To quote from the
PRD is described in "(Voter1998)"_#Voter1998 by Art Voter. Similar to
global or local hyperdynamics (HD), PRD is a method for performing
accelerated dynamics that is suitable for infrequent-event systems
that obey first-order kinetics. A good overview of accelerated
dynamics methods for such systems is given in this review paper
"(Voter2002)"_#Voter2002prd from Art's group. To quote from the
paper: "The dynamical evolution is characterized by vibrational
excursions within a potential basin, punctuated by occasional
transitions between basins." The transition probability is
@ -61,15 +62,26 @@ Running multiple replicas gives an effective enhancement in the
timescale spanned by the multiple simulations, while waiting for an
event to occur.
Each replica runs on a partition of one or more processors. Processor
partitions are defined at run-time using the "-partition command-line
switch"_Run_options.html. Note that if you have MPI installed, you
can run a multi-replica simulation with more replicas (partitions)
than you have physical processors, e.g you can run a 10-replica
simulation on one or two processors. However for PRD, this makes
little sense, since running a replica on virtual instead of physical
processors,offers no effective parallel speed-up in searching for
infrequent events. See the "Howto replica"_Howto_replica.html doc
Both PRD and HD produce a time-accurate trajectory that effectively
extends the timescale over which a system can be simulated, but they
do it differently. PRD creates Nr replicas of the system and runs
dynamics on each independently with a normal unbiased potential until
an event occurs in one of the replicas. The time between events is
reduced by a factor of Nr replicas. HD uses a single replica of the
system and accelerates time by biasing the interaction potential in a
manner such that each timestep is effectively longer. For both
methods, per CPU second, more physical time elapses and more events
occur. See the "hyper"_hyper.html doc page for more info about HD.
In PRD, each replica runs on a partition of one or more processors.
Processor partitions are defined at run-time using the "-partition
command-line switch"_Run_options.html. Note that if you have MPI
installed, you can run a multi-replica simulation with more replicas
(partitions) than you have physical processors, e.g. you can run a
10-replica simulation on one or two processors. However for PRD, this
makes little sense, since running a replica on virtual instead of
physical processors offers no effective parallel speed-up in searching
for infrequent events. See the "Howto replica"_Howto_replica.html doc
page for further discussion.
When a PRD simulation is performed, it is assumed that each replica is
@ -78,8 +90,8 @@ I.e. the simulation domain, the number of atoms, the interaction
potentials, etc should be the same for every replica.
A PRD run has several stages, which are repeated each time an "event"
occurs in one of the replicas, as defined below. The logic for a PRD
run is as follows:
occurs in one of the replicas, as explained below. The logic for a
PRD run is as follows:
while (time remains):
dephase for n_dephase*t_dephase steps
@ -129,7 +141,8 @@ Minimization parameters may be set via the
PRD command. The latter are the settings that would be used with the
"minimize"_minimize.html command. Note that typically, you do not
need to perform a highly-converged minimization to detect a transition
event.
event, though you may need to in order to prevent a set of atoms in
the system from relaxing to a saddle point.
The event check is performed by a compute with the specified
{compute-ID}. Currently there is only one compute that works with the
@ -307,7 +320,7 @@ deposit"_fix_deposit.html.
"min_modify"_min_modify.html, "min_style"_min_style.html,
"run_style"_run_style.html, "minimize"_minimize.html,
"velocity"_velocity.html, "temper"_temper.html, "neb"_neb.html,
"tad"_tad.html
"tad"_tad.html, "hyper"_hyper.html
[Default:]

View File

@ -78,6 +78,7 @@ friction: frictional contact of spherical asperities between 2d surfaces
gcmc: Grand Canonical Monte Carlo (GCMC) via the fix gcmc command
granregion: use of fix wall/region/gran as boundary on granular particles
hugoniostat: Hugoniostat shock dynamics
hyper: global and local hyperdynamics of diffusion on Pt surface
indent: spherical indenter into a 2d solid
kim: use of potentials in Knowledge Base for Interatomic Models (KIM)
latte: use of LATTE density-functional tight-binding quantum code

View File

@ -0,0 +1,184 @@
create_atoms 1 single 27.5 9.5 4
create_atoms 1 single 16 9 4
create_atoms 1 single 10 12 4
create_atoms 1 single 31 44 4
create_atoms 1 single 13 17 4
create_atoms 1 single 8.5 28.5 4
create_atoms 1 single 23 26 4
create_atoms 1 single 38 27 4
create_atoms 1 single 37.5 4.5 4
create_atoms 1 single 41.5 47.5 4
create_atoms 1 single 20.5 37.5 4
create_atoms 1 single 5 8 4
create_atoms 1 single 2.5 16.5 4
create_atoms 1 single 38.5 45.5 4
create_atoms 1 single 9 0 4
create_atoms 1 single 39 32 4
create_atoms 1 single 45.5 11.5 4
create_atoms 1 single 40 0 4
create_atoms 1 single 44.5 2.5 4
create_atoms 1 single 4.5 44.5 4
create_atoms 1 single 24.5 13.5 4
create_atoms 1 single 47.5 23.5 4
create_atoms 1 single 1 20 4
create_atoms 1 single 38.5 31.5 4
create_atoms 1 single 12.5 12.5 4
create_atoms 1 single 2 27 4
create_atoms 1 single 21 5 4
create_atoms 1 single 47 12 4
create_atoms 1 single 32.5 46.5 4
create_atoms 1 single 9.5 40.5 4
create_atoms 1 single 8.5 2.5 4
create_atoms 1 single 41.5 22.5 4
create_atoms 1 single 29 11 4
create_atoms 1 single 3.5 3.5 4
create_atoms 1 single 5 21 4
create_atoms 1 single 46.5 31.5 4
create_atoms 1 single 35 46 4
create_atoms 1 single 40.5 41.5 4
create_atoms 1 single 10 22 4
create_atoms 1 single 43.5 14.5 4
create_atoms 1 single 42 42 4
create_atoms 1 single 4 26 4
create_atoms 1 single 19 34 4
create_atoms 1 single 33 9 4
create_atoms 1 single 0.5 45.5 4
create_atoms 1 single 30.5 32.5 4
create_atoms 1 single 25.5 5.5 4
create_atoms 1 single 47.5 39.5 4
create_atoms 1 single 15 13 4
create_atoms 1 single 21 21 4
create_atoms 1 single 14 28 4
create_atoms 1 single 9 34 4
create_atoms 1 single 7 38 4
create_atoms 1 single 11 35 4
create_atoms 1 single 20.5 45.5 4
create_atoms 1 single 30.5 31.5 4
create_atoms 1 single 32.5 2.5 4
create_atoms 1 single 21.5 3.5 4
create_atoms 1 single 23 12 4
create_atoms 1 single 4.5 33.5 4
create_atoms 1 single 46 43 4
create_atoms 1 single 42.5 45.5 4
create_atoms 1 single 4.5 10.5 4
create_atoms 1 single 33.5 15.5 4
create_atoms 1 single 24 5 4
create_atoms 1 single 13 16 4
create_atoms 1 single 16.5 23.5 4
create_atoms 1 single 45.5 28.5 4
create_atoms 1 single 44.5 5.5 4
create_atoms 1 single 27.5 46.5 4
create_atoms 1 single 44.5 12.5 4
create_atoms 1 single 12 41 4
create_atoms 1 single 6 4 4
create_atoms 1 single 31.5 10.5 4
create_atoms 1 single 1 44 4
create_atoms 1 single 31 4 4
create_atoms 1 single 21 33 4
create_atoms 1 single 3 33 4
create_atoms 1 single 15 10 4
create_atoms 1 single 28.5 22.5 4
create_atoms 1 single 43 1 4
create_atoms 1 single 3.5 0.5 4
create_atoms 1 single 41 37 4
create_atoms 1 single 18.5 43.5 4
create_atoms 1 single 17 27 4
create_atoms 1 single 3 5 4
create_atoms 1 single 18.5 23.5 4
create_atoms 1 single 31.5 14.5 4
create_atoms 1 single 41 31 4
create_atoms 1 single 22 3 4
create_atoms 1 single 14.5 40.5 4
create_atoms 1 single 9 38 4
create_atoms 1 single 36 42 4
create_atoms 1 single 33 22 4
create_atoms 1 single 15.5 47.5 4
create_atoms 1 single 3 0 4
create_atoms 1 single 25.5 27.5 4
create_atoms 1 single 2.5 28.5 4
create_atoms 1 single 29.5 28.5 4
create_atoms 1 single 44.5 18.5 4
create_atoms 1 single 26 40 4
create_atoms 1 single 41 27 4
create_atoms 1 single 39.5 5.5 4
create_atoms 1 single 3 38 4
create_atoms 1 single 35 29 4
create_atoms 1 single 11 19 4
create_atoms 1 single 18 1 4
create_atoms 1 single 39.5 40.5 4
create_atoms 1 single 46 17 4
create_atoms 1 single 1.5 23.5 4
create_atoms 1 single 28.5 23.5 4
create_atoms 1 single 10 28 4
create_atoms 1 single 19 47 4
create_atoms 1 single 10.5 16.5 4
create_atoms 1 single 38 45 4
create_atoms 1 single 42.5 41.5 4
create_atoms 1 single 47.5 42.5 4
create_atoms 1 single 38 7 4
create_atoms 1 single 10 44 4
create_atoms 1 single 29.5 27.5 4
create_atoms 1 single 45 30 4
create_atoms 1 single 3 9 4
create_atoms 1 single 8.5 35.5 4
create_atoms 1 single 24 44 4
create_atoms 1 single 47 4 4
create_atoms 1 single 7.5 8.5 4
create_atoms 1 single 32.5 41.5 4
create_atoms 1 single 0.5 34.5 4
create_atoms 1 single 11 8 4
create_atoms 1 single 2 40 4
create_atoms 1 single 25 24 4
create_atoms 1 single 47.5 6.5 4
create_atoms 1 single 39.5 28.5 4
create_atoms 1 single 17 21 4
create_atoms 1 single 32 43 4
create_atoms 1 single 16.5 29.5 4
create_atoms 1 single 34 34 4
create_atoms 1 single 11.5 3.5 4
create_atoms 1 single 39 22 4
create_atoms 1 single 24.5 36.5 4
create_atoms 1 single 33 31 4
create_atoms 1 single 35.5 35.5 4
create_atoms 1 single 14.5 34.5 4
create_atoms 1 single 34 28 4
create_atoms 1 single 37 41 4
create_atoms 1 single 33 46 4
create_atoms 1 single 27.5 28.5 4
create_atoms 1 single 40.5 22.5 4
create_atoms 1 single 27.5 1.5 4
create_atoms 1 single 12 2 4
create_atoms 1 single 36 43 4
create_atoms 1 single 28.5 9.5 4
create_atoms 1 single 20.5 25.5 4
create_atoms 1 single 3 3 4
create_atoms 1 single 38 33 4
create_atoms 1 single 3 20 4
create_atoms 1 single 35 11 4
create_atoms 1 single 5 25 4
create_atoms 1 single 36.5 6.5 4
create_atoms 1 single 19.5 24.5 4
create_atoms 1 single 27 41 4
create_atoms 1 single 39.5 11.5 4
create_atoms 1 single 21.5 2.5 4
create_atoms 1 single 46.5 15.5 4
create_atoms 1 single 13 24 4
create_atoms 1 single 11 37 4
create_atoms 1 single 11.5 31.5 4
create_atoms 1 single 47 0 4
create_atoms 1 single 25.5 17.5 4
create_atoms 1 single 32 11 4
create_atoms 1 single 8 17 4
create_atoms 1 single 27.5 12.5 4
create_atoms 1 single 25 7 4
create_atoms 1 single 25.5 37.5 4
create_atoms 1 single 12 15 4
create_atoms 1 single 1 7 4
create_atoms 1 single 18.5 47.5 4
create_atoms 1 single 5 38 4
create_atoms 1 single 42 19 4
create_atoms 1 single 30.5 7.5 4
create_atoms 1 single 42.5 7.5 4
create_atoms 1 single 26.5 18.5 4
create_atoms 1 single 18.5 1.5 4
create_atoms 1 single 41.5 10.5 4

(4 new binary image files, 69-71 KiB each; content not shown)

View File

@ -0,0 +1,95 @@
# 3d EAM surface for global HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 500.0
variable Vmax index 0.5
variable qfactor index 0.3
variable cutbond index 3.2
variable cutevent index 1.1
variable steps index 100000
variable nevent index 1000
variable zoom index 1.8
units metal
atom_style atomic
atom_modify map array
boundary p p p
lattice fcc 3.92
region box block 0 6 0 6 0 4
create_box 3 box
create_atoms 1 box
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
create_atoms 2 single 3.5 3.5 4
# define frozen substrate and mobile atoms
group adatom type 2
region base block INF INF INF INF 0 1.8
set region base type 3
group base type 3
group mobile type 1 2
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 858872873 zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
# thermal equilibration
run 1000
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
# hyper/global
fix HG mobile hyper/global ${cutbond} ${qfactor} ${Vmax} ${Tequil}
# thermo output
thermo_style custom step temp pe f_HG f_HG[*]
thermo_modify lost ignore
thermo_modify temp tmobile
thermo ${nevent}
# dump output options
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 1000000 global.*.jpg v_acolor type &
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HG event min 1.0e-6 1.0e-6 100 100 dump 1

View File

@ -0,0 +1,112 @@
# 3d EAM surface for local HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 400.0
variable Vmax index 0.4
variable qfactor index 0.3
variable cutbond index 3.2
variable Dcut index 10.0
variable cutevent index 1.1
variable alpha index 200.0
variable boost index 4000.0
variable ghostcut index 12.0
variable steps index 1500
variable nevent index 100
variable nx index 8
variable ny index 8
variable zoom index 1.8
variable seed index 826626413
variable tol index 1.0e-15
variable add index 37K
units metal
atom_style atomic
atom_modify map array
boundary p p p
comm_modify cutoff ${ghostcut}
lattice fcc 3.92
region box block 0 6 0 6 0 4
create_box 2 box
create_atoms 1 box
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
# replicate in xy
replicate ${nx} ${ny} 1
# add adatoms
include adatoms.list.${add}
# define frozen substrate and mobile atoms
region base block INF INF INF INF 0 1.8
set region base type 2
group base type 2
group mobile type 1
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
# thermal equilibration
run 1000
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
# hyper/local
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} &
${Dcut} ${alpha} ${boost}
# thermo output
thermo_style custom step temp pe f_HL f_HL[*]
thermo_modify lost ignore
thermo_modify temp tmobile
thermo ${nevent}
# dump
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 &
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1

(4 new binary image files, 477-482 KiB each; content not shown)

File diff suppressed because it is too large.

View File

@ -0,0 +1,993 @@
LAMMPS (10 Oct 2018)
# 3d EAM surface for local HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 400.0
variable Vmax index 0.4
variable qfactor index 0.3
variable cutbond index 3.2
variable Dcut index 10.0
variable cutevent index 1.1
variable alpha index 200.0
variable boost index 4000.0
variable ghostcut index 12.0
variable steps index 1500
variable nevent index 100
variable nx index 8
variable ny index 8
variable zoom index 1.8
variable seed index 826626413
variable tol index 1.0e-15
variable add index 37K
units metal
atom_style atomic
atom_modify map array
boundary p p p
comm_modify cutoff ${ghostcut}
comm_modify cutoff 12.0
lattice fcc 3.92
Lattice spacing in x,y,z = 3.92 3.92 3.92
region box block 0 6 0 6 0 4
create_box 2 box
Created orthogonal box = (0 0 0) to (23.52 23.52 15.68)
2 by 4 by 2 MPI processor grid
create_atoms 1 box
Created 576 atoms
Time spent = 0.00108504 secs
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
orthogonal box = (0 0 -0.392) to (23.52 23.52 19.6)
# replicate in xy
replicate ${nx} ${ny} 1
replicate 8 ${ny} 1
replicate 8 8 1
orthogonal box = (0 0 -0.392) to (188.16 188.16 19.6)
4 by 4 by 1 MPI processor grid
36864 atoms
Time spent = 0.0028758 secs
# add adatoms
include adatoms.list.${add}
include adatoms.list.37K
create_atoms 1 single 27.5 9.5 4
Created 1 atoms
Time spent = 0.000183105 secs
create_atoms 1 single 16 9 4
Created 1 atoms
Time spent = 0.000178099 secs
create_atoms 1 single 10 12 4
Created 1 atoms
Time spent = 0.000179768 secs
create_atoms 1 single 31 44 4
Created 1 atoms
Time spent = 0.000184059 secs
create_atoms 1 single 13 17 4
Created 1 atoms
Time spent = 0.000173807 secs
create_atoms 1 single 8.5 28.5 4
Created 1 atoms
Time spent = 0.000167847 secs
create_atoms 1 single 23 26 4
Created 1 atoms
Time spent = 0.000179052 secs
create_atoms 1 single 38 27 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 37.5 4.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 41.5 47.5 4
Created 1 atoms
Time spent = 0.000172138 secs
create_atoms 1 single 20.5 37.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 5 8 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 2.5 16.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 38.5 45.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 9 0 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 39 32 4
Created 1 atoms
Time spent = 0.000170946 secs
create_atoms 1 single 45.5 11.5 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 40 0 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 44.5 2.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 4.5 44.5 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 24.5 13.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 47.5 23.5 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 1 20 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 38.5 31.5 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 12.5 12.5 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 2 27 4
Created 1 atoms
Time spent = 0.000188828 secs
create_atoms 1 single 21 5 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 47 12 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 32.5 46.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 9.5 40.5 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 8.5 2.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 41.5 22.5 4
Created 1 atoms
Time spent = 0.000174046 secs
create_atoms 1 single 29 11 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 3.5 3.5 4
Created 1 atoms
Time spent = 0.000165224 secs
create_atoms 1 single 5 21 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 46.5 31.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 35 46 4
Created 1 atoms
Time spent = 0.000183105 secs
create_atoms 1 single 40.5 41.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 10 22 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 43.5 14.5 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 42 42 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 4 26 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 19 34 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 33 9 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 0.5 45.5 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 30.5 32.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 25.5 5.5 4
Created 1 atoms
Time spent = 0.000178099 secs
create_atoms 1 single 47.5 39.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 15 13 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 21 21 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 14 28 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 9 34 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 7 38 4
Created 1 atoms
Time spent = 0.000175953 secs
create_atoms 1 single 11 35 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 20.5 45.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 30.5 31.5 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 32.5 2.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 21.5 3.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 23 12 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 4.5 33.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 46 43 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 42.5 45.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 4.5 10.5 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 33.5 15.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 24 5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 13 16 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 16.5 23.5 4
Created 1 atoms
Time spent = 0.000156164 secs
create_atoms 1 single 45.5 28.5 4
Created 1 atoms
Time spent = 0.000247002 secs
create_atoms 1 single 44.5 5.5 4
Created 1 atoms
Time spent = 0.000156164 secs
create_atoms 1 single 27.5 46.5 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 44.5 12.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 12 41 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 6 4 4
Created 1 atoms
Time spent = 0.0001688 secs
create_atoms 1 single 31.5 10.5 4
Created 1 atoms
Time spent = 0.00015521 secs
create_atoms 1 single 1 44 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 31 4 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 21 33 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 3 33 4
Created 1 atoms
Time spent = 0.000164032 secs
create_atoms 1 single 15 10 4
Created 1 atoms
Time spent = 0.0001719 secs
create_atoms 1 single 28.5 22.5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 43 1 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 3.5 0.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 41 37 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 43.5 4
Created 1 atoms
Time spent = 0.000213146 secs
create_atoms 1 single 17 27 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 3 5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 23.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 31.5 14.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 41 31 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 22 3 4
Created 1 atoms
Time spent = 0.00015521 secs
create_atoms 1 single 14.5 40.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 9 38 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 36 42 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 33 22 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 15.5 47.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 3 0 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 25.5 27.5 4
Created 1 atoms
Time spent = 0.000176907 secs
create_atoms 1 single 2.5 28.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 29.5 28.5 4
Created 1 atoms
Time spent = 0.000162125 secs
create_atoms 1 single 44.5 18.5 4
Created 1 atoms
Time spent = 0.000152826 secs
create_atoms 1 single 26 40 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 41 27 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 39.5 5.5 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 3 38 4
Created 1 atoms
Time spent = 0.000152826 secs
create_atoms 1 single 35 29 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 11 19 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 18 1 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 39.5 40.5 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 46 17 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 1.5 23.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 28.5 23.5 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 10 28 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 19 47 4
Created 1 atoms
Time spent = 0.000148058 secs
create_atoms 1 single 10.5 16.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 38 45 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 42.5 41.5 4
Created 1 atoms
Time spent = 0.000161886 secs
create_atoms 1 single 47.5 42.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 38 7 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 10 44 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 29.5 27.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 45 30 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 3 9 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 8.5 35.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 24 44 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 47 4 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 7.5 8.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 32.5 41.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 0.5 34.5 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 11 8 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 2 40 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 25 24 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 47.5 6.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 39.5 28.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 17 21 4
Created 1 atoms
Time spent = 0.000164032 secs
create_atoms 1 single 32 43 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 16.5 29.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 34 34 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 11.5 3.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 39 22 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 24.5 36.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 33 31 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 35.5 35.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 14.5 34.5 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 34 28 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 37 41 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 33 46 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 27.5 28.5 4
Created 1 atoms
Time spent = 0.000145197 secs
create_atoms 1 single 40.5 22.5 4
Created 1 atoms
Time spent = 0.000150919 secs
create_atoms 1 single 27.5 1.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 12 2 4
Created 1 atoms
Time spent = 0.000151873 secs
create_atoms 1 single 36 43 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 28.5 9.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 20.5 25.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 3 3 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 38 33 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 3 20 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 35 11 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 5 25 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 36.5 6.5 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 19.5 24.5 4
Created 1 atoms
Time spent = 0.000236988 secs
create_atoms 1 single 27 41 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 39.5 11.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 21.5 2.5 4
Created 1 atoms
Time spent = 0.000136852 secs
create_atoms 1 single 46.5 15.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 13 24 4
Created 1 atoms
Time spent = 0.000137091 secs
create_atoms 1 single 11 37 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 11.5 31.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 47 0 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 25.5 17.5 4
Created 1 atoms
Time spent = 0.00014019 secs
create_atoms 1 single 32 11 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 8 17 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 27.5 12.5 4
Created 1 atoms
Time spent = 0.000137806 secs
create_atoms 1 single 25 7 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 25.5 37.5 4
Created 1 atoms
Time spent = 0.000139952 secs
create_atoms 1 single 12 15 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 1 7 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 18.5 47.5 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 5 38 4
Created 1 atoms
Time spent = 0.000136852 secs
create_atoms 1 single 42 19 4
Created 1 atoms
Time spent = 0.000149012 secs
create_atoms 1 single 30.5 7.5 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 42.5 7.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 26.5 18.5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 1.5 4
Created 1 atoms
Time spent = 0.000137091 secs
create_atoms 1 single 41.5 10.5 4
Created 1 atoms
Time spent = 0.000140905 secs
# define frozen substrate and mobile atoms
region base block INF INF INF INF 0 1.8
set region base type 2
18432 settings made for type
group base type 2
18432 atoms in group base
group mobile type 1
18616 atoms in group mobile
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 ${Tequil} 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 400.0 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 400.0 1.0 826626413 zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
# thermal equilibration
run 1000
Neighbor list info ...
update every 1 steps, delay 5 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.07583
ghost atom cutoff = 12
binsize = 3.03792, bins = 62 62 7
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair eam/alloy, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.359 | 3.359 | 3.36 Mbytes
Step Temp E_pair E_mol TotEng Press
0 0 -206220.22 0 -206220.22 -52155.664
100 188.18127 -206044.43 0 -205591.63 -25068.83
200 274.34464 -205860.78 0 -205200.66 -40191.797
300 325.66286 -205750.01 0 -204966.4 -31510.222
400 352.48242 -205675.42 0 -204827.28 -35058.064
500 370.88571 -205619.66 0 -204727.25 -32735.022
600 388.62129 -205592.87 0 -204657.78 -33904.556
700 389.54874 -205579.73 0 -204642.4 -32769.852
800 395.56074 -205576.82 0 -204625.03 -33755.948
900 398.03458 -205564.48 0 -204606.74 -32777.103
1000 401.24089 -205562.85 0 -204597.4 -33785.341
Loop time of 4.3687 on 16 procs for 1000 steps with 37048 atoms
Performance: 98.885 ns/day, 0.243 hours/ns, 228.901 timesteps/s
98.4% CPU use with 16 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 3.2988 | 3.3828 | 3.4667 | 2.3 | 77.43
Neigh | 0.20856 | 0.23127 | 0.24382 | 1.9 | 5.29
Comm | 0.33313 | 0.45075 | 0.55485 | 9.2 | 10.32
Output | 0.00042987 | 0.00044042 | 0.00049591 | 0.0 | 0.01
Modify | 0.18811 | 0.28363 | 0.36798 | 9.7 | 6.49
Other | | 0.01983 | | | 0.45
Nlocal: 2315.5 ave 2332 max 2297 min
Histogram: 2 0 0 3 4 0 2 1 2 2
Nghost: 3186.31 ave 3205 max 3170 min
Histogram: 2 1 3 0 2 3 2 1 0 2
Neighs: 55590.9 ave 56174 max 55103 min
Histogram: 2 2 1 1 4 1 3 0 0 2
Total # of neighbors = 889454
Ave neighs/atom = 24.0082
Neighbor list builds = 105
Dangerous builds = 0
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
compute event all event/displace 1.1
# hyper/local
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 4000.0
# thermo output
thermo_style custom step temp pe f_HL f_HL[*]
WARNING: New thermo_style command, previous thermo_modify settings will be lost (../output.cpp:705)
thermo_modify lost ignore
thermo_modify temp tmobile
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
thermo ${nevent}
thermo 100
# dump
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom 1.8 adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min 1.0e-15 ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min 1.0e-15 1.0e-15 1000 1000 dump 1
WARNING: Resetting reneighboring criteria during hyper (../hyper.cpp:133)
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.07583
ghost atom cutoff = 12
binsize = 3.03792, bins = 62 62 7
2 neighbor lists, perpetual/occasional/extra = 1 1 0
(1) pair eam/alloy, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton
bin: standard
(2) fix hyper/local, occasional
attributes: full, newton on, cut 10
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 7.566 | 7.567 | 7.567 Mbytes
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
0 401.24089 -205562.85 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 0 0 0 0
77 401.24089 -206534.96 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 1540 0 0 0
Loop time of 0.540347 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
0 401.24089 -205562.85 23.271302 74 0.18753621 1 6.0138739 703.62325 0 0.55802338 3.5350432 0 0 0 4e+19 10.115141 10.115141 0 0 0 0 0 0 0 0 0
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
Loop time of 0.579085 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
184 399.15639 -206534.96 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 49.934783 0.21714886 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 3680 0 0 0
Loop time of 0.556056 on 16 procs for 84 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
100 399.15639 -205546.21 22.903938 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
Loop time of 0.581214 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
275 403.01717 -206534.96 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 66.145455 0.29040418 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 5500 0 0 0
Loop time of 0.481812 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
200 403.01717 -205543.17 21.115577 91 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
Loop time of 0.5757 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
377 399.01963 -206534.96 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 73.225464 0.31760598 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 7540 0 0 0
Loop time of 0.514907 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
300 399.01963 -205541.46 19.137003 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
Loop time of 0.577371 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
471 398.15351 -206534.96 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 78.163482 0.33881076 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 9420 0 0 0
Loop time of 0.465473 on 16 procs for 71 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
400 398.15351 -205544.87 20.470689 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
Loop time of 0.579188 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
577 400.29399 -206534.96 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 79.710572 0.3455768 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 11540 0 0 0
Loop time of 0.502193 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
500 400.29399 -205544.98 17.051107 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
Loop time of 0.694955 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
680 400.96099 -206534.96 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 81.188235 0.35174818 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 13600 0 0 0
Loop time of 0.529041 on 16 procs for 80 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
600 400.96099 -205544.56 20.904088 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.1853748 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
Loop time of 0.590093 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.2139704 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
790 397.78618 -206534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.1995704 81.625316 0.35310868 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 15800 0 0 0
Loop time of 0.594281 on 16 procs for 90 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
700 397.78618 -205534.96 20.236364 94 0.51088027 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.205089 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 1 2 6
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
Loop time of 0.583824 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
872 399.66919 -206535.54 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 84.739679 0.36548679 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 17440 1 2 6
Loop time of 0.46886 on 16 procs for 72 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
800 399.66919 -205547.44 21.2852 94 0.44964213 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 2 4 13
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
Loop time of 0.585137 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
975 401.5853 -206535.54 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 84.939487 0.36762438 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 19500 2 4 13
Loop time of 0.502012 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
900 401.5853 -205544.22 19.307938 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
Loop time of 0.588597 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
1083 395.06218 -206535.54 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 85.421053 0.36763584 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 21660 2 4 13
Loop time of 0.543222 on 16 procs for 83 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1000 395.06218 -205526.35 17.514191 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
Loop time of 0.590075 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
1177 400.04484 -206535.53 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 86.464741 0.37201529 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 23540 2 4 13
Loop time of 0.500839 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1100 400.04484 -205545.92 19.518413 89 0.429675 0.39705701 6.0137119 703.6043 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 3 6 19
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
Loop time of 0.583971 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
1277 400.7462 -206535.53 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 86.806578 0.37396584 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 25540 3 6 19
Loop time of 0.509118 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1200 400.7462 -205543.2 21.169281 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
Loop time of 0.587306 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
1375 398.53702 -206535.53 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 87.455273 0.37616341 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 27500 3 6 19
Loop time of 0.483781 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1300 398.53702 -205539.33 21.35787 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
Loop time of 0.586411 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
1471 402.80537 -206535.53 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 88.193746 0.37856948 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 29420 3 6 19
Loop time of 0.473799 on 16 procs for 71 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1400 402.80537 -205549.3 19.481632 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
Loop time of 0.587342 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
1574 402.0803 -206535.53 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 88.491741 0.37898213 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 31480 3 6 19
Loop time of 0.493982 on 16 procs for 74 steps with 37048 atoms
Final hyper stats ...
Cumulative quantities for fix hyper:
hyper time = 30000
event timesteps = 3
# of atoms in events = 6
Quantities for this hyper run:
event timesteps = 3
# of atoms in events = 6
max length of any bond = 4.4266
max drift distance of any atom = 2.31777
fraction of steps & bonds with zero bias = 0.0704091
Current quantities:
ave bonds/atom = 6.01371
Cumulative quantities specific to fix hyper/local:
# of new bonds formed = 19
max bonds/atom = 13
Quantities for this hyper run specific to fix hyper/local:
ave boosted bonds/step = 92.8573
ave boost coeff of all bonds = 0.397679
max boost coeff of any bond = 0.414894
min boost coeff of any bond = 0.383728
max dist from my box of any non-maxstrain bond ghost atom = 10.333
max dist from my box of any bond ghost atom = 10.3858
count of ghost bond neighbors not found on reneighbor steps = 0
lost bond partners = 0
ave bias coeff for lost bond partners = 0
bias overlaps = 0
non-matching bias coeffs = 0
CPU time for bond builds = 0.044807
Current quantities specific to fix hyper/local:
neighbor bonds/bond = 703.604
ave boost coeff for all bonds = 0.396356
Loop time of 17.9972 on 16 procs for 1500 steps with 37048 atoms
Performance: 36.006 ns/day, 0.667 hours/ns, 83.346 timesteps/s
120.7% CPU use with 16 MPI tasks x no OpenMP threads
Hyper stats:
Dynamics time (%) = 8.87027 (49.2869)
Quench time (%) = 8.15972 (45.3388)
Other time (%) = 1.2212 (6.78552)
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 11.6 | 11.848 | 12.043 | 3.9 | 65.83
Neigh | 0.50025 | 0.52638 | 0.55163 | 2.1 | 2.92
Comm | 0.34528 | 0.49905 | 0.66742 | 13.3 | 2.77
Output | 0.0021305 | 0.0021461 | 0.0022686 | 0.1 | 0.01
Modify | 3.7498 | 3.9009 | 3.9786 | 2.8 | 21.67
Other | | 1.221 | | | 6.79
Nlocal: 2315.5 ave 2361 max 2267 min
Histogram: 1 1 0 4 2 1 3 3 0 1
Nghost: 3187.88 ave 3236 max 3141 min
Histogram: 1 0 3 2 2 1 4 1 1 1
Neighs: 53950.6 ave 54989 max 53049 min
Histogram: 2 0 3 2 1 2 4 1 0 1
FullNghs: 542951 ave 554654 max 533224 min
Histogram: 1 2 3 1 2 2 2 2 0 1
Total # of neighbors = 8687214
Ave neighs/atom = 234.485
Neighbor list builds = 165
Dangerous builds = 0
Total wall time: 0:00:22
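For reference, the final hyper stats are consistent with the run parameters echoed earlier in this log: fix hyper/local is given a target boost factor of 4000, and the hyper command runs 1500 steps with a 0.005 timestep. Because fix hyper/local accumulates time at the target boost rate, the reported hyper time works out to (assuming metal units, so times are in ps):

    hyper time = Nsteps * dt * Btarget = 1500 * 0.005 * 4000 = 30000

compared with 7.5 ps of direct MD for the same number of steps.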

File diff suppressed because it is too large

View File

@ -1,5 +1,68 @@
# Change Log
## [2.7.24](https://github.com/kokkos/kokkos/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.00...2.7.24)
**Implemented enhancements:**
- DualView: Add non-templated functions for sync, need\_sync, view, modify [\#1858](https://github.com/kokkos/kokkos/issues/1858)
- DualView: Avoid needlessly allocates and initializes modify\_host and modify\_device flag views [\#1831](https://github.com/kokkos/kokkos/issues/1831)
- DualView: Incorrect deduction of "not device type" [\#1659](https://github.com/kokkos/kokkos/issues/1659)
- BuildSystem: Add KOKKOS\_ENABLE\_CXX14 and KOKKOS\_ENABLE\_CXX17 [\#1602](https://github.com/kokkos/kokkos/issues/1602)
- BuildSystem: Installed kokkos\_generated\_settings.cmake contains build directories instead of install directories [\#1838](https://github.com/kokkos/kokkos/issues/1838)
- BuildSystem: KOKKOS\_ARCH: add ticks to printout of improper arch setting [\#1649](https://github.com/kokkos/kokkos/issues/1649)
- BuildSystem: Make core/src/Makefile for Cuda use needed nvcc\_wrapper [\#1296](https://github.com/kokkos/kokkos/issues/1296)
- Build: Support PGI as host compiler for NVCC [\#1828](https://github.com/kokkos/kokkos/issues/1828)
- Build: Many Warnings Fixed e.g.[\#1786](https://github.com/kokkos/kokkos/issues/1786)
- Capability: OffsetView with non-zero begin index [\#567](https://github.com/kokkos/kokkos/issues/567)
- Capability: Reductions into device side view [\#1788](https://github.com/kokkos/kokkos/issues/1788)
- Capability: Add max\_size to Kokkos::Array [\#1760](https://github.com/kokkos/kokkos/issues/1760)
- Capability: View Assignment: LayoutStride -\> LayoutLeft and LayoutStride -\> LayoutRight [\#1594](https://github.com/kokkos/kokkos/issues/1594)
- Capability: Atomic function allow implicit conversion of update argument [\#1571](https://github.com/kokkos/kokkos/issues/1571)
- Capability: Add team\_size\_max with tagged functors [\#663](https://github.com/kokkos/kokkos/issues/663)
- Capability: Fix allignment of views from Kokkos\_ScratchSpace should use different alignment [\#1700](https://github.com/kokkos/kokkos/issues/1700)
- Capabilitiy: create\_mirror\_view\_and\_copy for DynRankView [\#1651](https://github.com/kokkos/kokkos/issues/1651)
- Capability: DeepCopy HBWSpace / HostSpace [\#548](https://github.com/kokkos/kokkos/issues/548)
- ROCm: support team vector scan [\#1645](https://github.com/kokkos/kokkos/issues/1645)
- ROCm: Merge from rocm-hackathon2 [\#1636](https://github.com/kokkos/kokkos/issues/1636)
- ROCm: Add ParallelScanWithTotal [\#1611](https://github.com/kokkos/kokkos/issues/1611)
- ROCm: Implement MDRange in ROCm [\#1314](https://github.com/kokkos/kokkos/issues/1314)
- ROCm: Implement Reducers for Nested Parallelism Levels [\#963](https://github.com/kokkos/kokkos/issues/963)
- ROCm: Add asynchronous deep copy [\#959](https://github.com/kokkos/kokkos/issues/959)
- Tests: Memory pool test seems to allocate 8GB [\#1830](https://github.com/kokkos/kokkos/issues/1830)
- Tests: Add unit\_test for team\_broadcast [\#734](https://github.com/kokkos/kokkos/issues/734)
**Fixed bugs:**
- BuildSystem: Makefile.kokkos gets gcc-toolchain wrong if gcc is cached [\#1841](https://github.com/kokkos/kokkos/issues/1841)
- BuildSystem: kokkos\_generated\_settings.cmake placement is inconsistent [\#1771](https://github.com/kokkos/kokkos/issues/1771)
- BuildSystem: Invalid escape sequence \. in kokkos\_functions.cmake [\#1661](https://github.com/kokkos/kokkos/issues/1661)
- BuildSystem: Problem in Kokkos generated cmake file [\#1770](https://github.com/kokkos/kokkos/issues/1770)
- BuildSystem: invalid file names on windows [\#1671](https://github.com/kokkos/kokkos/issues/1671)
- Tests: reducers min/max\_loc test fails randomly due to multiple min values and thus multiple valid locations [\#1681](https://github.com/kokkos/kokkos/issues/1681)
- Tests: cuda.scatterview unit test causes "Bus error" when force\_uvm and enable\_lambda are enabled [\#1852](https://github.com/kokkos/kokkos/issues/1852)
- Tests: cuda.cxx11 unit test fails when force\_uvm and enable\_lambda are enabled [\#1850](https://github.com/kokkos/kokkos/issues/1850)
- Tests: threads.reduce\_device\_view\_range\_policy failing with Cuda/8.0.44 and RDC [\#1836](https://github.com/kokkos/kokkos/issues/1836)
- Build: compile error when compiling Kokkos with hwloc 2.0.1 \(on OSX 10.12.6, with g++ 7.2.0\) [\#1506](https://github.com/kokkos/kokkos/issues/1506)
- Build: dual\_view.view broken with UVM [\#1834](https://github.com/kokkos/kokkos/issues/1834)
- Build: White cuda/9.2 + gcc/7.2 warnings triggering errors [\#1833](https://github.com/kokkos/kokkos/issues/1833)
- Build: warning: enum constant in boolean context [\#1813](https://github.com/kokkos/kokkos/issues/1813)
- Capability: Fix overly conservative max\_team\_size thingy [\#1808](https://github.com/kokkos/kokkos/issues/1808)
- DynRankView: Ctors taking ViewAllocateWithoutInitializing broken [\#1783](https://github.com/kokkos/kokkos/issues/1783)
- Cuda: Apollo cuda.team\_broadcast test fail with clang-6.0 [\#1762](https://github.com/kokkos/kokkos/issues/1762)
- Cuda: Clang spurious test failure in impl\_view\_accessible [\#1753](https://github.com/kokkos/kokkos/issues/1753)
- Cuda: Kokkos::complex\<double\> atomic deadlocks with Clang 6 Cuda build with -O0 [\#1752](https://github.com/kokkos/kokkos/issues/1752)
- Cuda: LayoutStride Test fails for UVM as default memory space [\#1688](https://github.com/kokkos/kokkos/issues/1688)
- Cuda: Scan wrong values on Volta [\#1676](https://github.com/kokkos/kokkos/issues/1676)
- Cuda: Kokkos::deep\_copy error with CudaUVM and Kokkos::Serial spaces [\#1652](https://github.com/kokkos/kokkos/issues/1652)
- Cuda: cudaErrorInvalidConfiguration with debug build [\#1647](https://github.com/kokkos/kokkos/issues/1647)
- Cuda: parallel\_for with TeamPolicy::team\_size\_recommended with launch bounds not working -- reported by Daniel Holladay [\#1283](https://github.com/kokkos/kokkos/issues/1283)
- Cuda: Using KOKKOS\_CLASS\_LAMBDA in a class with Kokkos::Random\_XorShift64\_Pool member data [\#1696](https://github.com/kokkos/kokkos/issues/1696)
- Long Build Times on Darwin [\#1721](https://github.com/kokkos/kokkos/issues/1721)
- Capability: Typo in Kokkos\_Sort.hpp - BinOp3D - wrong comparison [\#1720](https://github.com/kokkos/kokkos/issues/1720)
- Buffer overflow in SharedAllocationRecord in Kokkos\_HostSpace.cpp [\#1673](https://github.com/kokkos/kokkos/issues/1673)
- Serial unit test failure [\#1632](https://github.com/kokkos/kokkos/issues/1632)
## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00)

View File

@ -11,7 +11,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
# Define Project Name if this is a standalone build
IF(NOT DEFINED ${PROJECT_NAME})
project(Kokkos CXX)
project(Kokkos CXX)
ENDIF()
# Basic initialization (Used in KOKKOS_SETTINGS)
@ -22,7 +22,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
set_kokkos_cxx_compiler()
set_kokkos_cxx_standard()
#------------ GET OPTIONS AND KOKKOS_SETTINGS --------------------------------
# Add Kokkos' modules to CMake's module path.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
@ -34,7 +34,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
#------------ GENERATE HEADER AND SOURCE FILES -------------------------------
execute_process(
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} PREFIX=${CMAKE_INSTALL_PREFIX} generate_build_settings
WORKING_DIRECTORY "${Kokkos_BINARY_DIR}"
OUTPUT_FILE ${Kokkos_BINARY_DIR}/core_src_make.out
RESULT_VARIABLE GEN_SETTINGS_RESULT
@ -45,6 +45,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
endif()
include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake)
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION lib/cmake/Kokkos)
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION ${CMAKE_INSTALL_PREFIX})
string(REPLACE " " ";" KOKKOS_TPL_INCLUDE_DIRS "${KOKKOS_GMAKE_TPL_INCLUDE_DIRS}")
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_DIRS "${KOKKOS_GMAKE_TPL_LIBRARY_DIRS}")
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_NAMES "${KOKKOS_GMAKE_TPL_LIBRARY_NAMES}")

View File

@ -6,9 +6,9 @@ ifndef KOKKOS_PATH
endif
CXXFLAGS=$(CCFLAGS)
# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#KOKKOS_DEVICES ?= "Pthread"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
@ -21,12 +21,13 @@ KOKKOS_ARCH ?= ""
KOKKOS_DEBUG ?= "no"
# Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
# Options: c++11,c++1z
# Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
KOKKOS_CXX_STANDARD ?= "c++11"
# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
KOKKOS_OPTIONS ?= ""
# Option for setting ETI path
KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
KOKKOS_CMAKE ?= "no"
# Default settings specific options.
# Options: force_uvm,use_ldg,rdc,enable_lambda
@ -41,7 +42,11 @@ kokkos_has_string=$(if $(findstring $2,$1),1,0)
# Check for general settings.
KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11)
KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14)
KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y)
KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
# Check for external libraries.
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
@ -110,6 +115,18 @@ KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VE
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
# Check Host Compiler if using NVCC through nvcc_wrapper
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1)
KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1))
KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),PGI)
KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),Intel Corporation)
KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),clang)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
endif
@ -202,18 +219,34 @@ endif
# Set C++11 flags.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_CXX11_FLAG := --c++11
KOKKOS_INTERNAL_CXX14_FLAG := --c++14
#KOKKOS_INTERNAL_CXX17_FLAG := --c++17
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
#KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
#KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
#KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
#KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14
#KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y
#KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17
#KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z
#KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
KOKKOS_INTERNAL_CXX11_FLAG :=
else
KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14
KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y
KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17
KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a
endif
endif
endif
@ -336,7 +369,9 @@ endif
#CPPFLAGS is now unused
KOKKOS_CPPFLAGS =
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
endif
KOKKOS_TPL_INCLUDE_DIRS =
KOKKOS_TPL_LIBRARY_DIRS =
KOKKOS_TPL_LIBRARY_NAMES =
@ -347,9 +382,11 @@ endif
KOKKOS_LIBS = -ldl
KOKKOS_TPL_LIBRARY_NAMES += dl
KOKKOS_LDFLAGS = -L$(shell pwd)
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_LDFLAGS = -L$(shell pwd)
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
endif
KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =
@ -377,10 +414,12 @@ tmp := $(call kokkos_append_header,"/* Execution Spaces */")
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
@ -438,11 +477,25 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX1Z")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX20")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
@ -465,7 +518,9 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
ifneq ($(HWLOC_PATH),)
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
@ -484,7 +539,9 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
ifneq ($(MEMKIND_PATH),)
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
@ -977,7 +1034,9 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
endif
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
ifneq ($(CUDA_PATH),)
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
@ -1032,7 +1091,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
ifneq ($(QTHREADS_PATH),)
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
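For reference, the Makefile.kokkos hunks above extend KOKKOS_CXX_STANDARD beyond c++11 (adding c++14, c++1y, c++17, c++1z, and c++2a), pick the matching compiler flag per toolchain, and emit a corresponding KOKKOS_ENABLE_CXX* define into the generated configuration header. A minimal, illustrative sketch of how application code can branch on those defines (this program is not part of the patch; it only assumes a Kokkos build configured through this Makefile):

```cpp
#include <Kokkos_Core.hpp>  // brings in the generated KokkosCore_config.h via Kokkos_Macros.hpp
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
#if defined(KOKKOS_ENABLE_CXX17)
  std::printf("Kokkos was configured for C++17 (or a draft thereof)\n");
#elif defined(KOKKOS_ENABLE_CXX14)
  std::printf("Kokkos was configured for C++14\n");
#else
  std::printf("Kokkos falls back to the C++11 configuration\n");
#endif
  Kokkos::finalize();
  return 0;
}
```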

View File

@ -52,44 +52,47 @@ For specifics see the LICENSE file contained in the repository or distribution.
* GCC 4.8.4
* GCC 4.9.3
* GCC 5.1.0
* GCC 5.3.0
* GCC 5.5.0
* GCC 6.1.0
* GCC 7.2.0
* GCC 7.3.0
* GCC 8.1.0
* Intel 15.0.2
* Intel 16.0.1
* Intel 17.1.043
* Intel 17.0.1
* Intel 17.4.196
* Intel 18.0.128
* Intel 18.2.128
* Clang 3.6.1
* Clang 3.7.1
* Clang 3.8.1
* Clang 3.9.0
* Clang 4.0.0
* Clang 4.0.0 for CUDA (CUDA Toolkit 8.0.44)
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.1)
* PGI 17.10
* NVCC 7.0 for CUDA (with gcc 4.8.4)
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
* PGI 18.7
* NVCC 7.5 for CUDA (with gcc 4.8.4)
* NVCC 8.0.44 for CUDA (with gcc 5.3.0)
* NVCC 9.1 for CUDA (with gcc 6.1.0)
### Primary tested compilers on Power 8 are:
* GCC 5.4.0 (OpenMP,Serial)
* IBM XL 13.1.6 (OpenMP, Serial)
* NVCC 8.0.44 for CUDA (with gcc 5.4.0)
* NVCC 9.0.103 for CUDA (with gcc 6.3.0 and XL 13.1.6)
* GCC 6.4.0 (OpenMP,Serial)
* GCC 7.2.0 (OpenMP,Serial)
* IBM XL 16.1.0 (OpenMP, Serial)
* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
### Primary tested compilers on Intel KNL are:
* GCC 6.2.0
* Intel 16.4.258 (with gcc 4.7.2)
* Intel 17.2.174 (with gcc 4.9.3)
* Intel 18.0.128 (with gcc 4.9.3)
* Intel 18.2.199 (with gcc 4.9.3)
### Primary tested compilers on ARM
* GCC 6.1.0
### Primary tested compilers on ARM (Cavium ThunderX2)
* GCC 7.2.0
* ARM/Clang 18.4.0
### Other compilers working:
* X86:
- Cygwin 2.1.0 64bit with gcc 4.9.3
- GCC 8.1.0 (not warning free)
### Known non-working combinations:
* Power8:

View File

@ -697,6 +697,7 @@ namespace Kokkos {
typedef Random_XorShift64<DeviceType> generator_type;
typedef DeviceType device_type;
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool() {
num_states_ = 0;
}
@ -709,12 +710,14 @@ namespace Kokkos {
#endif
}
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
locks_(src.locks_),
state_(src.state_),
num_states_(src.num_states_)
{}
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
locks_ = src.locks_;
state_ = src.state_;
@ -958,6 +961,7 @@ namespace Kokkos {
typedef DeviceType device_type;
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool() {
num_states_ = 0;
}
@ -972,6 +976,7 @@ namespace Kokkos {
#endif
}
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
locks_(src.locks_),
state_(src.state_),
@ -979,6 +984,7 @@ namespace Kokkos {
num_states_(src.num_states_)
{}
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
locks_ = src.locks_;
state_ = src.state_;
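For reference, the hunks above mark the default constructor, copy constructor, and assignment operator of Random_XorShift64_Pool and Random_XorShift1024_Pool as KOKKOS_INLINE_FUNCTION, which lets a pool object be captured by value inside device kernels. A minimal usage sketch (the seed, view length, and kernel label below are arbitrary placeholders):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Pool of per-thread generator states; it is captured by value in the lambda,
    // which relies on the device-marked copy constructor touched by this patch.
    Kokkos::Random_XorShift64_Pool<> pool(/*seed=*/12345);
    Kokkos::View<double*> r("random_values", 1000);
    Kokkos::parallel_for("fill_random", r.extent(0), KOKKOS_LAMBDA(const int i) {
      auto gen = pool.get_state();   // acquire a generator for this thread
      r(i) = gen.drand(0.0, 1.0);    // uniform double in [0, 1)
      pool.free_state(gen);          // hand the generator back to the pool
    });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
```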

View File

@ -246,8 +246,8 @@ public:
{
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
bin_count_const = bin_count_atomic;
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
sort_order = offset_type("PermutationVector",range_end-range_begin);
bin_offsets = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::bin_offsets"),bin_op.max_bins());
sort_order = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sort_order"),range_end-range_begin);
}
BinSort( const_key_view_type keys_
@ -290,7 +290,7 @@ public:
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
scratch_view_type
sorted_values("Scratch",
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
len,
values.extent(1),
values.extent(2),
@ -301,7 +301,7 @@ public:
values.extent(7));
#else
scratch_view_type
sorted_values("Scratch",
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
values.rank_dynamic > 1 ? values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
values.rank_dynamic > 2 ? values.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@ -483,7 +483,7 @@ struct BinOp3D {
if (keys(i1,0)>keys(i2,0)) return true;
else if (keys(i1,0)==keys(i2,0)) {
if (keys(i1,1)>keys(i2,1)) return true;
else if (keys(i1,1)==keys(i2,2)) {
else if (keys(i1,1)==keys(i2,1)) {
if (keys(i1,2)>keys(i2,2)) return true;
}
}
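For reference, besides correcting the BinOp3D tie-break (keys(i1,1) is now compared against keys(i2,1) instead of keys(i2,2)), the hunks above construct the BinSort scratch views with ViewAllocateWithoutInitializing, which skips the fill-on-allocation pass for buffers that are completely overwritten before being read. A small sketch of that allocation idiom (the labels and length are placeholders):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1000;
    // Default View construction zero-initializes every entry ...
    Kokkos::View<int*> bin_offsets("bin_offsets", n);
    // ... while ViewAllocateWithoutInitializing only allocates, leaving the contents
    // indeterminate; this is safe for scratch that is fully written before it is read.
    Kokkos::View<int*> sort_order(Kokkos::ViewAllocateWithoutInitializing("sort_order"), n);
  }
  Kokkos::finalize();
  return 0;
}
```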

View File

@ -0,0 +1,41 @@
#Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo
KOKKOS_DEVICES = "Cuda"
KOKKOS_ARCH = "Pascal60"
KOKKOS_CUDA_OPTIONS = enable_lambda
#KOKKOS_DEVICES = "OpenMP"
#KOKKOS_ARCH = "Power8"
SRC = gups-kokkos.cc
default: build
echo "Start Build"
CXXFLAGS = -O3
CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper
#CXX = g++
LINK = ${CXX}
LINKFLAGS =
EXE = gups-kokkos
DEPFLAGS = -M
OBJ = $(SRC:.cc=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o $(EXE)
# Compilation rules
%.o:%.cc $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,199 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//@HEADER
*/
#include "Kokkos_Core.hpp"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <sys/time.h>
#define HLINE "-------------------------------------------------------------\n"
#if defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
#else
typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
#endif
typedef int GUPSIndex;
double now() {
struct timeval now;
gettimeofday(&now, NULL);
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
}
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
for( GUPSIndex i = 0; i < indices.extent(0); ++i ) {
indices[i] = lrand48() % dataCount;
}
Kokkos::deep_copy(dev_indices, indices);
}
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
const bool performAtomics) {
if( performAtomics ) {
Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
Kokkos::atomic_fetch_xor( &data[indices[i]], datum );
});
} else {
Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
data[indices[i]] ^= datum;
});
}
Kokkos::fence();
}
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
const bool useAtomics) {
printf("Reports fastest timing per kernel\n");
printf("Creating Views...\n");
printf("Memory Sizes:\n");
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No") );
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
printf(HLINE);
GUPSDeviceArray dev_indices("indices", indicesCount);
GUPSDeviceArray dev_data("data", dataCount);
int64_t datum = -1;
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
double gupsTime = 0.0;
printf("Initializing Views...\n");
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
#else
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
#endif
KOKKOS_LAMBDA(const int i) {
data[i] = 10101010101;
});
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
#else
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
#endif
KOKKOS_LAMBDA(const int i) {
indices[i] = 0;
});
Kokkos::deep_copy(dev_data, data);
Kokkos::deep_copy(dev_indices, indices);
double start;
printf("Starting benchmarking...\n");
for( GUPSIndex k = 0; k < repeats; ++k ) {
randomize_indices(indices, dev_indices, data.extent(0));
start = now();
run_gups(dev_indices, dev_data, datum, useAtomics);
gupsTime += now() - start;
}
Kokkos::deep_copy(indices, dev_indices);
Kokkos::deep_copy(data, dev_data);
printf(HLINE);
printf("GUP/s Random: %18.6f\n",
(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
printf(HLINE);
return 0;
}
int main(int argc, char* argv[]) {
printf(HLINE);
printf("Kokkos GUPS Benchmark\n");
printf(HLINE);
srand48(1010101);
Kokkos::initialize(argc, argv);
int64_t indices = 8192;
int64_t data = 33554432;
int64_t repeats = 10;
bool useAtomics = false;
for( int i = 1; i < argc; ++i ) {
if( strcmp( argv[i], "--indices" ) == 0 ) {
indices = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--data" ) == 0 ) {
data = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
repeats = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
useAtomics = true;
}
}
const int rc = run_benchmark(indices, data, repeats, useAtomics);
Kokkos::finalize();
return rc;
}
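The GUP/s figure printed by run_benchmark() is simply the total number of random XOR updates divided by the accumulated kernel time. A minimal sketch of that arithmetic, with an illustrative helper name that is not part of the benchmark source:

// Giga-updates per second: each repeat issues indicesCount random updates.
double giga_updates_per_second(const int repeats, const int64_t indicesCount, const double totalSeconds) {
  return (1.0e-9 * (double) repeats * (double) indicesCount) / totalSeconds;
}
// e.g. 10 repeats over 8192 indices completed in 0.001 s -> 0.08192 GUP/s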

View File

@ -0,0 +1,41 @@
#Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo
#KOKKOS_DEVICES = "Cuda"
#KOKKOS_ARCH = "Pascal60"
#KOKKOS_CUDA_OPTIONS = enable_lambda
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "Power8"
SRC = stream-kokkos.cc
default: build
echo "Start Build"
CXXFLAGS = -O3
#CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper
CXX = g++
LINK = ${CXX}
LINKFLAGS =
EXE = stream-kokkos
DEPFLAGS = -M
OBJ = $(SRC:.cc=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o $(EXE)
# Compilation rules
%.o:%.cc $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,265 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//@HEADER
*/
#include "Kokkos_Core.hpp"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cinttypes>   // PRIu64 format macro
#include <algorithm>   // std::min
#include <limits>      // std::numeric_limits
#include <sys/time.h>
#define STREAM_ARRAY_SIZE 100000000
#define STREAM_NTIMES 20
#define HLINE "-------------------------------------------------------------\n"
#if defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
#else
typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
#endif
typedef int StreamIndex;
double now() {
struct timeval now;
gettimeofday(&now, NULL);
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
}
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
c[i] = a[i];
});
Kokkos::fence();
}
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
const double scalar) {
Kokkos::parallel_for("scale", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
b[i] = scalar * c[i];
});
Kokkos::fence();
}
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
Kokkos::parallel_for("add", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
c[i] = a[i] + b[i];
});
Kokkos::fence();
}
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
const double scalar) {
Kokkos::parallel_for("triad", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
a[i] = b[i] + scalar * c[i];
});
Kokkos::fence();
}
int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
const StreamIndex arraySize, const double scalar) {
double ai = 1.0;
double bi = 2.0;
double ci = 0.0;
for( StreamIndex i = 0; i < arraySize; ++i ) {
ci = ai;
bi = scalar * ci;
ci = ai + bi;
ai = bi + scalar * ci;
}
double aError = 0.0;
double bError = 0.0;
double cError = 0.0;
// Accumulate the absolute error of every element so the averages below are meaningful.
for( StreamIndex i = 0; i < arraySize; ++i ) {
aError += std::abs( a[i] - ai );
bError += std::abs( b[i] - bi );
cError += std::abs( c[i] - ci );
}
double aAvgError = aError / (double) arraySize;
double bAvgError = bError / (double) arraySize;
double cAvgError = cError / (double) arraySize;
const double epsilon = 1.0e-13;
int errorCount = 0;
if( std::abs( aAvgError / ai ) > epsilon ) {
fprintf(stderr, "Error: validation check on View a failed.\n");
errorCount++;
}
if( std::abs( bAvgError / bi ) > epsilon ) {
fprintf(stderr, "Error: validation check on View b failed.\n");
errorCount++;
}
if( std::abs( cAvgError / ci ) > epsilon ) {
fprintf(stderr, "Error: validation check on View c failed.\n");
errorCount++;
}
if( errorCount == 0 ) {
printf("All solutions checked and verified.\n");
}
return errorCount;
}
int run_benchmark() {
printf("Reports fastest timing per kernel\n");
printf("Creating Views...\n");
printf("Memory Sizes:\n");
printf("- Array Size: %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
printf("- Per Array: %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
printf("- Total: %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
printf(HLINE);
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
const double scalar = 3.0;
double copyTime = std::numeric_limits<double>::max();
double scaleTime = std::numeric_limits<double>::max();
double addTime = std::numeric_limits<double>::max();
double triadTime = std::numeric_limits<double>::max();
printf("Initializing Views...\n");
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
#else
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
#endif
KOKKOS_LAMBDA(const int i) {
a[i] = 1.0;
b[i] = 2.0;
c[i] = 0.0;
});
// Copy the contents of the host views (a, b, c) into their device counterparts (dev_a, dev_b, dev_c)
Kokkos::deep_copy(dev_a, a);
Kokkos::deep_copy(dev_b, b);
Kokkos::deep_copy(dev_c, c);
double start;
printf("Starting benchmarking...\n");
for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
start = now();
perform_copy(dev_a, dev_b, dev_c);
copyTime = std::min( copyTime, (now() - start) );
start = now();
perform_scale(dev_a, dev_b, dev_c, scalar);
scaleTime = std::min( scaleTime, (now() - start) );
start = now();
perform_add(dev_a, dev_b, dev_c);
addTime = std::min( addTime, (now() - start) );
start = now();
perform_triad(dev_a, dev_b, dev_c, scalar);
triadTime = std::min( triadTime, (now() - start) );
}
Kokkos::deep_copy(a, dev_a);
Kokkos::deep_copy(b, dev_b);
Kokkos::deep_copy(c, dev_c);
printf("Performing validation...\n");
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
printf(HLINE);
printf("Copy %11.2f MB/s\n",
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
printf("Scale %11.2f MB/s\n",
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
printf("Add %11.2f MB/s\n",
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
printf("Triad %11.2f MB/s\n",
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
printf(HLINE);
return rc;
}
int main(int argc, char* argv[]) {
printf(HLINE);
printf("Kokkos STREAM Benchmark\n");
printf(HLINE);
Kokkos::initialize(argc, argv);
const int rc = run_benchmark();
Kokkos::finalize();
return rc;
}
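The bandwidth figures above follow the usual STREAM accounting: copy and scale move two arrays of doubles per element, add and triad move three, and the fastest observed time per kernel is used. A small sketch of that calculation, with an illustrative helper name:

// MB/s for a kernel that touches `arraysTouched` arrays of `arraySize` doubles in `seconds`.
double stream_mb_per_second(const int arraysTouched, const size_t arraySize, const double seconds) {
  return (1.0e-6 * (double) arraysTouched * (double) sizeof(double) * (double) arraySize) / seconds;
}
// copy/scale use arraysTouched = 2, add/triad use arraysTouched = 3, matching the printf calls above.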

View File

@ -125,18 +125,20 @@ function show_help {
echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP"
echo " Default: 1"
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --force-openmp-num-threads=N"
echo " --openmp-num-threads=N"
echo " Override logic for selecting OMP_NUM_THREADS"
echo " --force-openmp-proc-bind=<OP>"
echo " --openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --openmp-nested Set OMP_NESTED to true"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --output-prefix=<P> Save the output to files of the form"
echo " P.hpcbind.N, P.stdout.N and P.stderr.N where P is "
echo " the prefix and N is the rank (no spaces)"
echo " --output-mode=<Op> How console output should be handled."
echo " Options are all, rank0, and none. Default: rank0"
echo " --lstopo Show bindings in lstopo"
echo " --save-topology=<Xml> Save the topology to the given xml file"
echo " --load-topology=<Xml> Load a previously saved topology from an xml file"
echo " -v|--verbose Print bindings and relevant environment variables"
echo " -h|--help Show this message"
echo ""
@ -189,7 +191,7 @@ HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
HPCBIND_OPENMP_FORCE_NUM_THREADS=""
HPCBIND_OPENMP_FORCE_PROC_BIND=""
declare -i HPCBIND_OPENMP_NESTED=1
declare -i HPCBIND_OPENMP_NESTED=0
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_LSTOPO=0
@ -197,6 +199,9 @@ declare -i HPCBIND_LSTOPO=0
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"
HPCBIND_OUTPUT_TOPOLOGY=""
HPCBIND_INPUT_TOPOLOGY=""
declare -i HPCBIND_HAS_COMMAND=0
for i in "$@"; do
@ -276,10 +281,22 @@ for i in "$@"; do
HPCBIND_OPENMP_NESTED=0
shift
;;
--openmp-nested)
HPCBIND_OPENMP_NESTED=1
shift
;;
--output-prefix=*)
HPCBIND_OUTPUT_PREFIX="${i#*=}"
shift
;;
--save-topology=*)
HPCBIND_OUTPUT_TOPOLOGY="${i#*=}"
shift
;;
--load-topology=*)
HPCBIND_INPUT_TOPOLOGY="${i#*=}"
shift
;;
--output-mode=*)
HPCBIND_OUTPUT_MODE="${i#*=}"
#convert to lower case
@ -327,24 +344,37 @@ elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
HPCBIND_TEE=1
fi
# Save the topology to the given xml file
if [[ "${HPCBIND_OUTPUT_TOPOLOGY}" != "" ]]; then
if [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
lstopo-no-graphics "${HPCBIND_OUTPUT_TOPOLOGY}"
else
lstopo-no-graphics >/dev/null 2>&1
fi
fi
# Load the topology to the given xml file
if [[ "${HPCBIND_INPUT_TOPOLOGY}" != "" ]]; then
if [ -f ${HPCBIND_INPUT_TOPOLOGY} ]; then
export HWLOC_XMLFILE="${HPCBIND_INPUT_TOPOLOGY}"
export HWLOC_THISSYSTEM=1
fi
fi
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_LOG=/dev/null
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_RANK}"
if [[ ${HPCBIND_QUEUE_SIZE} -le 0 ]]; then
HPCBIND_QUEUE_SIZE=1
fi
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
> ${HPCBIND_LOG}
fi
@ -546,6 +576,8 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
hostname -s >> ${HPCBIND_LOG}
echo "[HPCBIND]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
echo "[HWLOC]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG}
echo "[CUDA]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
echo "[OPENMP]" >> ${HPCBIND_LOG}
@ -568,6 +600,8 @@ else
hostname -s > >(tee -a ${HPCBIND_LOG})
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
echo "[HWLOC]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG})
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})

View File

@ -74,6 +74,9 @@ dry_run=0
host_only=0
host_only_args=""
# Just run version on host compiler
get_host_version=0
# Enable workaround for CUDA 6.5 for pragma ident
replace_pragma_ident=0
@ -93,6 +96,9 @@ depfile_separate=0
depfile_output_arg=""
depfile_target_arg=""
# Option to remove duplicate libraries and object files
remove_duplicate_link_files=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -106,10 +112,18 @@ do
--host-only)
host_only=1
;;
#get the host version only
--host-version)
get_host_version=1
;;
#replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and non-standardized behaviour of pragma with macros
--replace-pragma-ident)
replace_pragma_ident=1
;;
#remove duplicate link files
--remove-duplicate-link-files)
remove_duplicate_link_files=1
;;
#handle source files to be compiled as cuda files
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
@ -124,7 +138,12 @@ do
fi
;;
#Handle shared args (valid for both nvcc and the host compiler)
-D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
-D*)
unescape_commas=`echo "$1" | sed -e 's/\\\,/,/g'`
arg=`printf "%q" $unescape_commas`
shared_args="$shared_args $arg"
;;
-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared|-w)
shared_args="$shared_args $1"
;;
#Handle compilation argument
@ -152,7 +171,7 @@ do
shift
;;
#Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle more known nvcc args
@ -164,8 +183,11 @@ do
cuda_args="$cuda_args $1 $2"
shift
;;
-rdc=*|-maxrregcount*|--maxrregcount*)
cuda_args="$cuda_args $1"
;;
#Handle c++11
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1y|-std=c++1y|--std=c++17|-std=c++17|--std=c++1z|-std=c++1z)
if [ $stdcxx_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
else
@ -205,6 +227,15 @@ do
fi
shift
;;
#Handle -+ (same as -x c++, specifically used for xl compilers, but mutually exclusive with -x. So replace it with -x c++)
-+)
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args="-x,c++"
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,-x,c++"
fi
;;
#Handle -ccbin (if its not set we can set it to a default value)
-ccbin)
cuda_args="$cuda_args $1 $2"
@ -212,18 +243,39 @@ do
host_compiler=$2
shift
;;
#Handle -arch argument (if its not set use a default
-arch*)
#Handle -arch argument (if its not set use a default) this is the version with = sign
-arch*|-gencode*)
cuda_args="$cuda_args $1"
arch_set=1
;;
#Handle -code argument (if its not set use a default) this is the version with = sign
-code*)
cuda_args="$cuda_args $1"
;;
#Handle -arch argument (if its not set use a default) this is the version without = sign
-arch|-gencode)
cuda_args="$cuda_args $1 $2"
arch_set=1
shift
;;
#Handle -code argument (if its not set use a default) this is the version without = sign
-code)
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle -Xcudafe argument
-Xcudafe)
cuda_args="$cuda_args -Xcudafe $2"
shift
;;
#Handle -Xlinker argument
-Xlinker)
xlinker_args="$xlinker_args -Xlinker $2"
shift
;;
#Handle args that should be sent to the linker
-Wl*)
-Wl,*)
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
host_linker_args="$host_linker_args ${1:4:${#1}}"
;;
@ -256,6 +308,44 @@ do
shift
done
# Only print host compiler version
if [ $get_host_version -eq 1 ]; then
$host_compiler --version
exit
fi
#Remove duplicate object files
if [ $remove_duplicate_link_files -eq 1 ]; then
for obj in $object_files
do
object_files_reverse="$obj $object_files_reverse"
done
object_files_reverse_clean=""
for obj in $object_files_reverse
do
exists=false
for obj2 in $object_files_reverse_clean
do
if [ "$obj" == "$obj2" ]
then
exists=true
echo "Exists: $obj"
fi
done
if [ "$exists" == "false" ]
then
object_files_reverse_clean="$object_files_reverse_clean $obj"
fi
done
object_files=""
for obj in $object_files_reverse_clean
do
object_files="$obj $object_files"
done
fi
#Add default host compiler if necessary
if [ $ccbin_set -ne 1 ]; then
cuda_args="$cuda_args -ccbin $host_compiler"
@ -328,10 +418,19 @@ fi
#Run compilation command
if [ $host_only -eq 1 ]; then
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$host_command"
fi
$host_command
elif [ -n "$nvcc_depfile_command" ]; then
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$nvcc_command && $nvcc_depfile_command"
fi
$nvcc_command && $nvcc_depfile_command
else
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$nvcc_command"
fi
$nvcc_command
fi
error_code=$?

View File

@ -235,3 +235,7 @@ install(FILES
# Install the export set for use with the install-tree
INSTALL(EXPORT KokkosTargets DESTINATION
"${INSTALL_CMAKE_DIR}")
# build and install pkgconfig file
CONFIGURE_FILE(core/src/kokkos.pc.in kokkos.pc @ONLY)
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)

View File

@ -47,7 +47,7 @@ function(set_kokkos_cxx_compiler)
OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+$"
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
INTERNAL_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
endif()

View File

@ -41,7 +41,6 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
string(TOUPPER ${opt} OPT )
IF(DEFINED Kokkos_ENABLE_${opt})
MESSAGE("Kokkos_ENABLE_${opt} is defined!")
IF(DEFINED KOKKOS_ENABLE_${OPT})
IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL)
@ -59,7 +58,6 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
ENDIF()
ELSE()
SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
MESSAGE("set KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT!")
ENDIF()
ENDIF()
endforeach()
@ -81,6 +79,7 @@ list(APPEND KOKKOS_ARCH_LIST
ARMv80 # (HOST) ARMv8.0 Compatible CPU
ARMv81 # (HOST) ARMv8.1 Compatible CPU
ARMv8-ThunderX # (HOST) ARMv8 Cavium ThunderX CPU
ARMv8-TX2 # (HOST) ARMv8 Cavium ThunderX2 CPU
WSM # (HOST) Intel Westmere CPU
SNB # (HOST) Intel Sandy/Ivy Bridge CPUs
HSW # (HOST) Intel Haswell CPUs
@ -123,11 +122,18 @@ list(APPEND KOKKOS_DEVICES_LIST
# List of possible TPLs for Kokkos
# From Makefile.kokkos: Options: hwloc,librt,experimental_memkind
set(KOKKOS_USE_TPLS_LIST)
if(APPLE)
list(APPEND KOKKOS_USE_TPLS_LIST
HWLOC # hwloc
MEMKIND # experimental_memkind
)
else()
list(APPEND KOKKOS_USE_TPLS_LIST
HWLOC # hwloc
LIBRT # librt
MEMKIND # experimental_memkind
)
endif()
# Map of cmake variables to Makefile variables
set(KOKKOS_INTERNAL_HWLOC hwloc)
set(KOKKOS_INTERNAL_LIBRT librt)
@ -172,6 +178,7 @@ set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
set(tmpr "\n ")
string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_ARCH_DOCSTR "${KOKKOS_ARCH_LIST}")
set(KOKKOS_INTERNAL_ARCH_DOCSTR "${tmpr}${KOKKOS_INTERNAL_ARCH_DOCSTR}")
# This would be useful, but we use Foo_ENABLE mechanisms
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_DEVICES_DOCSTR "${KOKKOS_DEVICES_LIST}")
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_USE_TPLS_DOCSTR "${KOKKOS_USE_TPLS_LIST}")
@ -269,7 +276,7 @@ set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_P
set_kokkos_default_default(DEPRECATED_CODE ON)
set(KOKKOS_ENABLE_DEPRECATED_CODE ${KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE_DEFAULT} CACHE BOOL "Enable deprecated code.")
set_kokkos_default_default(EXPLICIT_INSTANTIATION ON)
set_kokkos_default_default(EXPLICIT_INSTANTIATION OFF)
set(KOKKOS_ENABLE_EXPLICIT_INSTANTIATION ${KOKKOS_INTERNAL_ENABLE_EXPLICIT_INSTANTIATION_DEFAULT} CACHE BOOL "Enable explicit template instantiation.")
#-------------------------------------------------------------------------------

View File

@ -15,16 +15,16 @@
# Ensure that KOKKOS_ARCH is in the ARCH_LIST
if (KOKKOS_ARCH MATCHES ",")
message("-- Detected a comma in: KOKKOS_ARCH=${KOKKOS_ARCH}")
message("-- Detected a comma in: KOKKOS_ARCH=`${KOKKOS_ARCH}`")
message("-- Although we prefer KOKKOS_ARCH to be semicolon-delimited, we do allow")
message("-- comma-delimited values for compatibility with scripts (see github.com/trilinos/Trilinos/issues/2330)")
string(REPLACE "," ";" KOKKOS_ARCH "${KOKKOS_ARCH}")
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=${KOKKOS_ARCH}")
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=`${KOKKOS_ARCH}`")
endif()
foreach(arch ${KOKKOS_ARCH})
list(FIND KOKKOS_ARCH_LIST ${arch} indx)
if (indx EQUAL -1)
message(FATAL_ERROR "${arch} is not an accepted value for KOKKOS_ARCH."
message(FATAL_ERROR "`${arch}` is not an accepted value in KOKKOS_ARCH=`${KOKKOS_ARCH}`."
" Please pick from these choices: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
endif ()
endforeach()
@ -130,7 +130,8 @@ string(REPLACE ";" ":" KOKKOS_INTERNAL_ADDTOPATH "${addpathl}")
# Set the KOKKOS_SETTINGS String -- this is the primary communication with the
# makefile configuration. See Makefile.kokkos
set(KOKKOS_SETTINGS KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
set(KOKKOS_SETTINGS KOKKOS_CMAKE=yes)
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_PATH=${KOKKOS_PATH})
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_INSTALL_PATH=${CMAKE_INSTALL_PREFIX})

View File

@ -241,17 +241,16 @@ elif [ "$MACHINE" = "white" ]; then
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
CUDA_MODULE_LIST2="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.3.0,ibm/xl/13.1.6"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.4.0,ibm/xl/16.1.0"
# Don't do pthread on white.
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.6 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/9.0.103 $CUDA_MODULE_LIST2 $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
@ -362,7 +361,7 @@ elif [ "$MACHINE" = "apollo" ]; then
"gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS"
"cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else

View File

@ -96,6 +96,7 @@ template< class DataType ,
class Arg3Type = void>
class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
{
template< class , class , class , class > friend class DualView ;
public:
//! \name Typedefs for device types and various Kokkos::View specializations.
//@{
@ -182,8 +183,20 @@ public:
//! \name Counters to keep track of changes ("modified" flags)
//@{
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
protected:
// modified_flags[0] -> host
// modified_flags[1] -> device
typedef View<unsigned int[2],LayoutLeft,Kokkos::HostSpace> t_modified_flags;
t_modified_flags modified_flags;
public:
#else
typedef View<unsigned int[2],LayoutLeft,typename t_host::execution_space> t_modified_flags;
typedef View<unsigned int,LayoutLeft,typename t_host::execution_space> t_modified_flag;
t_modified_flags modified_flags;
t_modified_flag modified_host,modified_device;
#endif
//@}
//! \name Constructors
@ -194,10 +207,14 @@ public:
/// Both device and host View objects are constructed using their
/// default constructors. The "modified" flags are both initialized
/// to "unmodified."
DualView () :
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{}
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
DualView () = default;
#else
DualView ():modified_flags (t_modified_flags("DualView::modified_flags")) {
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
}
#endif
/// \brief Constructor that allocates View objects on both host and device.
///
@ -219,17 +236,24 @@ public:
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
: d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
, h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
, modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
, modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{}
, modified_flags (t_modified_flags("DualView::modified_flags"))
{
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
#endif
}
//! Copy constructor (shallow copy)
template<class SS, class LS, class DS, class MS>
DualView (const DualView<SS,LS,DS,MS>& src) :
d_view (src.d_view),
h_view (src.h_view),
modified_device (src.modified_device),
modified_host (src.modified_host)
modified_flags (src.modified_flags)
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, modified_host(src.modified_host)
, modified_device(src.modified_device)
#endif
{}
//! Subview constructor
@ -241,8 +265,11 @@ public:
)
: d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
, h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
, modified_device (src.modified_device)
, modified_host (src.modified_host)
, modified_flags (src.modified_flags)
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, modified_host(src.modified_host)
, modified_device(src.modified_device)
#endif
{}
/// \brief Create DualView from existing device and host View objects.
@ -258,8 +285,7 @@ public:
DualView (const t_dev& d_view_, const t_host& h_view_) :
d_view (d_view_),
h_view (h_view_),
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
modified_flags (t_modified_flags("DualView::modified_flags"))
{
if ( int(d_view.rank) != int(h_view.rank) ||
d_view.extent(0) != h_view.extent(0) ||
@ -281,6 +307,10 @@ public:
d_view.span() != h_view.span() ) {
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
#endif
}
//@}
@ -316,6 +346,30 @@ public:
t_dev,
t_host>::type& view () const
{
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
static_assert(
device_is_t_dev_device || device_is_t_host_device ||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
(
(!device_is_execspace && !device_is_memspace) && (
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
)
)
,
"Template parameter to .view() must exactly match one of the DualView's device types or one of the execution or memory spaces");
#endif
return Impl::if_c<
std::is_same<
typename t_dev::memory_space,
@ -324,6 +378,72 @@ public:
t_host >::select (d_view , h_view);
}
KOKKOS_INLINE_FUNCTION
t_host view_host() const {
return h_view;
}
KOKKOS_INLINE_FUNCTION
t_dev view_device() const {
return d_view;
}
template<class Device>
static int get_device_side() {
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
static_assert(
device_is_t_dev_device || device_is_t_host_device ||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
(
(!device_is_execspace && !device_is_memspace) && (
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
)
)
,
"Template parameter to .sync() must exactly match one of the DualView's device types or one of the execution or memory spaces");
#endif
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
int dev = -1;
#else
int dev = 0;
#endif
if(device_is_t_dev_device) dev = 1;
else if(device_is_t_host_device) dev = 0;
else {
if(device_is_memspace) {
if(device_mem_is_t_dev_mem) dev = 1;
if(device_mem_is_t_host_mem) dev = 0;
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
}
if(device_is_execspace) {
if(device_exec_is_t_dev_exec) dev = 1;
if(device_exec_is_t_host_exec) dev = 0;
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
}
if(!device_is_execspace && !device_is_memspace) {
if(device_mem_is_t_dev_mem) dev = 1;
if(device_mem_is_t_host_mem) dev = 0;
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
if(device_exec_is_t_dev_exec) dev = 1;
if(device_exec_is_t_host_exec) dev = 0;
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
}
}
return dev;
}
/// \brief Update data on device or host only if data in the other
/// space has been marked as modified.
///
@ -347,23 +467,20 @@ public:
( std::is_same< Device , int>::value)
, int >::type& = 0)
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return;
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
int dev = get_device_side<Device>();
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
deep_copy (d_view, h_view);
modified_host() = modified_device() = 0;
modified_flags(0) = modified_flags(1) = 0;
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0) { // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
deep_copy (h_view, d_view);
modified_host() = modified_device() = 0;
modified_flags(0) = modified_flags(1) = 0;
}
}
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
@ -378,46 +495,71 @@ public:
( std::is_same< Device , int>::value)
, int >::type& = 0 )
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
unsigned int>::select (1, 0);
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
if(modified_flags.data()==NULL) return;
int dev = get_device_side<Device>();
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0){ // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
}
}
}
void sync_host() {
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
Impl::throw_runtime_exception("Calling sync_host on a DualView with a const datatype.");
if(modified_flags.data()==NULL) return;
if(modified_flags(1) > modified_flags(0)) {
deep_copy (h_view, d_view);
modified_flags(1) = modified_flags(0) = 0;
}
}
void sync_device() {
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
Impl::throw_runtime_exception("Calling sync_device on a DualView with a const datatype.");
if(modified_flags.data()==NULL) return;
if(modified_flags(0) > modified_flags(1)) {
deep_copy (d_view, h_view);
modified_flags(1) = modified_flags(0) = 0;
}
}
template<class Device>
bool need_sync() const
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return false;
int dev = get_device_side<Device>();
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
return true;
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0){ // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
return true;
}
}
return false;
}
inline bool need_sync_host() const {
if(modified_flags.data()==NULL) return false;
return modified_flags(0)<modified_flags(1);
}
inline bool need_sync_device() const {
if(modified_flags.data()==NULL) return false;
return modified_flags(1)<modified_flags(0);
}
/// \brief Mark data as modified on the given device \c Device.
///
/// If \c Device is the same as this DualView's device type, then
@ -425,26 +567,22 @@ public:
/// data as modified.
template<class Device>
void modify () {
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return;
int dev = get_device_side<Device>();
if (dev) { // if Device is the same as DualView's device type
if (dev == 1) { // if Device is the same as DualView's device type
// Increment the device's modified count.
modified_device () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
} else { // hopefully Device is the same as DualView's host type
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
}
if (dev == 0) { // hopefully Device is the same as DualView's host type
// Increment the host's modified count.
modified_host () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
}
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_host() && modified_device()) {
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
@ -455,6 +593,45 @@ public:
#endif
}
inline void modify_host() {
if(modified_flags.data()!=NULL) {
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify_host ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
msg += d_view.label();
msg += "\"\n";
Kokkos::abort(msg.c_str());
}
#endif
}
}
inline void modify_device() {
if(modified_flags.data()!=NULL) {
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify_device ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
msg += d_view.label();
msg += "\"\n";
Kokkos::abort(msg.c_str());
}
#endif
}
}
inline void clear_sync_state() {
if(modified_flags.data()!=NULL)
modified_flags(1) = modified_flags(0) = 0;
}
//@}
//! \name Methods for reallocating or resizing the View objects.
//@{
@ -476,7 +653,10 @@ public:
h_view = create_mirror_view( d_view );
/* Reset dirty flags */
modified_device() = modified_host() = 0;
if(modified_flags.data()==NULL) {
modified_flags = t_modified_flags("DualView::modified_flags");
} else
modified_flags(1) = modified_flags(0) = 0;
}
/// \brief Resize both views, copying old contents into new if necessary.
@ -491,13 +671,16 @@ public:
const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ) {
if(modified_device() >= modified_host()) {
if(modified_flags.data()==NULL) {
modified_flags = t_modified_flags("DualView::modified_flags");
}
if(modified_flags(1) >= modified_flags(0)) {
/* Resize on Device */
::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
h_view = create_mirror_view( d_view );
/* Mark Device copy as modified */
modified_device() = modified_device()+1;
modified_flags(1) = modified_flags(1)+1;
} else {
/* Realloc on Device */
@ -525,7 +708,7 @@ public:
d_view = create_mirror_view( typename t_dev::execution_space(), h_view );
/* Mark Host copy as modified */
modified_host() = modified_host()+1;
modified_flags(0) = modified_flags(0)+1;
}
}
@ -649,7 +832,10 @@ void
deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if (src.modified_device () >= src.modified_host ()) {
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {
@ -666,7 +852,10 @@ deep_copy (const ExecutionSpace& exec ,
DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if (src.modified_device () >= src.modified_host ()) {
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(exec, dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (exec, dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {

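The DualView changes above consolidate the old modified_host/modified_device counters into a single two-element modified_flags view and add space-free helpers such as view_host(), view_device(), modify_host(), modify_device(), sync_host(), sync_device(), need_sync_host(), need_sync_device(), and clear_sync_state(). A minimal usage sketch of the typical round trip with the new interface; the names dv and "touch" are illustrative only:

Kokkos::DualView<double*> dv("dv", 100);
dv.view_host()(0) = 1.0;   // write through the host view
dv.modify_host();          // record that the host copy changed
dv.sync_device();          // deep_copy host -> device only if the device copy is stale
auto d = dv.view_device();
Kokkos::parallel_for("touch", 100, KOKKOS_LAMBDA(const int i) { d(i) += 1.0; });
dv.modify_device();
dv.sync_host();            // deep_copy device -> host only if the host copy is stale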
View File

@ -64,7 +64,7 @@ namespace Impl {
template <typename Specialize>
struct DynRankDimTraits {
enum : size_t{unspecified =KOKKOS_INVALID_INDEX};
enum : size_t{unspecified = KOKKOS_INVALID_INDEX};
// Compute the rank of the view from the nonzero dimension arguments.
KOKKOS_INLINE_FUNCTION
@ -384,8 +384,8 @@ public:
// Removed dimension checks...
typedef typename DstType::offset_type dst_offset_type ;
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_map.m_impl_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_impl_handle , src.m_track );
dst.m_track.assign( src.m_track , DstTraits::is_managed );
dst.m_rank = src.Rank ;
}
@ -565,10 +565,14 @@ public:
//----------------------------------------
// Allow specializations to query their specialized map
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
implementation_map() const { return m_map ; }
#endif
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
impl_map() const { return m_map ; }
//----------------------------------------
@ -624,7 +628,7 @@ public:
reference_type operator()() const
{
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
return implementation_map().reference();
return impl_map().reference();
//return m_map.reference(0,0,0,0,0,0,0);
}
@ -647,7 +651,7 @@ public:
typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
operator[](const iType & i0) const
{
// auto map = implementation_map();
// auto map = impl_map();
const size_t dim_scalar = m_map.dimension_scalar();
const size_t bytes = this->span() / dim_scalar;
@ -785,7 +789,7 @@ public:
reference_type access() const
{
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
return implementation_map().reference();
return impl_map().reference();
//return m_map.reference(0,0,0,0,0,0,0);
}
@ -1004,7 +1008,7 @@ public:
//----------------------------------------
// Allocation according to allocation properties and array layout
// unused arg_layout dimensions must be set toKOKKOS_INVALID_INDEX so that rank deduction can properly take place
// unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that rank deduction can properly take place
template< class ... P >
explicit inline
DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
@ -1179,7 +1183,7 @@ public:
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string >( arg_label )
, typename traits::array_layout
( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
)
)
{}
// For backward compatibility
@ -1189,8 +1193,7 @@ public:
, const typename traits::array_layout & arg_layout
)
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
, Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout)
, arg_layout
)
{}
@ -1205,7 +1208,9 @@ public:
, const size_t arg_N6 =KOKKOS_INVALID_INDEX
, const size_t arg_N7 =KOKKOS_INVALID_INDEX
)
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
, typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)
)
{}
//----------------------------------------
@ -1445,30 +1450,30 @@ public:
ret_type dst ;
const SubviewExtents< 7 , rank > extents =
ExtentGenerator< Args ... >::generator( src.m_map.m_offset.m_dim , args... ) ;
ExtentGenerator< Args ... >::generator( src.m_map.m_impl_offset.m_dim , args... ) ;
dst_offset_type tempdst( src.m_map.m_offset , extents ) ;
dst_offset_type tempdst( src.m_map.m_impl_offset , extents ) ;
dst.m_track = src.m_track ;
dst.m_map.m_offset.m_dim.N0 = tempdst.m_dim.N0 ;
dst.m_map.m_offset.m_dim.N1 = tempdst.m_dim.N1 ;
dst.m_map.m_offset.m_dim.N2 = tempdst.m_dim.N2 ;
dst.m_map.m_offset.m_dim.N3 = tempdst.m_dim.N3 ;
dst.m_map.m_offset.m_dim.N4 = tempdst.m_dim.N4 ;
dst.m_map.m_offset.m_dim.N5 = tempdst.m_dim.N5 ;
dst.m_map.m_offset.m_dim.N6 = tempdst.m_dim.N6 ;
dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0 ;
dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1 ;
dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2 ;
dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3 ;
dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4 ;
dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5 ;
dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6 ;
dst.m_map.m_offset.m_stride.S0 = tempdst.m_stride.S0 ;
dst.m_map.m_offset.m_stride.S1 = tempdst.m_stride.S1 ;
dst.m_map.m_offset.m_stride.S2 = tempdst.m_stride.S2 ;
dst.m_map.m_offset.m_stride.S3 = tempdst.m_stride.S3 ;
dst.m_map.m_offset.m_stride.S4 = tempdst.m_stride.S4 ;
dst.m_map.m_offset.m_stride.S5 = tempdst.m_stride.S5 ;
dst.m_map.m_offset.m_stride.S6 = tempdst.m_stride.S6 ;
dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0 ;
dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1 ;
dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2 ;
dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3 ;
dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4 ;
dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5 ;
dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6 ;
dst.m_map.m_handle = dst_handle_type( src.m_map.m_handle +
src.m_map.m_offset( extents.domain_offset(0)
dst.m_map.m_impl_handle = dst_handle_type( src.m_map.m_impl_handle +
src.m_map.m_impl_offset( extents.domain_offset(0)
, extents.domain_offset(1)
, extents.domain_offset(2)
, extents.domain_offset(3)
@ -1896,6 +1901,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
@ -1914,6 +1920,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
@ -1929,7 +1936,11 @@ create_mirror( const DynRankView<T,P...> & src
// Create a mirror in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src) {
typename Impl::MirrorDRVType<Space,T,P ...>::view_type
create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
>::type * = 0) {
return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
}
@ -1985,6 +1996,29 @@ create_mirror_view(const Space& , const Kokkos::DynRankView<T,P...> & src
return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorDRViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror( Kokkos::ViewAllocateWithoutInitializing(label), Impl::reconstructLayout(src.layout(), src.rank()) );
deep_copy(mirror, src);
return mirror;
}
} //end Kokkos
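create_mirror_view_and_copy for DynRankView, added above, follows the existing View overload: when the requested space already matches the source's memory space the source is returned as-is, otherwise a mirror is allocated without initialization and deep-copied. A brief sketch of the call pattern, mirroring the unit test added later in this patch; the view names are illustrative:

Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A", 10);
// Same memory space: no allocation, a_h2 aliases a_h.
auto a_h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_h);
// Different memory space (e.g. CudaSpace): allocates a device mirror and deep-copies a_h into it.
auto a_d = Kokkos::create_mirror_view_and_copy(Kokkos::DefaultExecutionSpace::memory_space(), a_h);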

File diff suppressed because it is too large

View File

@ -47,7 +47,9 @@
#include <string>
#include <vector>
#include <Kokkos_Core.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Parallel_Reduce.hpp>
namespace Kokkos {

View File

@ -86,14 +86,13 @@ public:
vector():DV() {
_size = 0;
_extra_storage = 1.1;
DV::modified_host() = 1;
}
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
_size = n;
_extra_storage = 1.1;
DV::modified_host() = 1;
DV::modified_flags(0) = 1;
assign(n,val);
}
@ -119,16 +118,16 @@ public:
/* Assign value either on host or on device */
if( DV::modified_host() >= DV::modified_device() ) {
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::execution_space::fence();
DV::modified_host()++;
DV::template modify<typename DV::t_host::device_type>();
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::execution_space::fence();
DV::modified_device()++;
DV::template modify<typename DV::t_dev::device_type>();
}
}
@ -137,7 +136,8 @@ public:
}
void push_back(Scalar val) {
DV::modified_host()++;
DV::template sync<typename DV::t_host::device_type>();
DV::template modify<typename DV::t_host::device_type>();
if(_size == span()) {
size_t new_size = _size*_extra_storage;
if(new_size == _size) new_size++;
@ -247,10 +247,10 @@ public:
}
void on_host() {
DV::modified_host() = DV::modified_device() + 1;
DV::template modify<typename DV::t_host::device_type>();
}
void on_device() {
DV::modified_device() = DV::modified_host() + 1;
DV::template modify<typename DV::t_dev::device_type>();
}
void set_overallocation(float extra) {

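The Kokkos::vector hunks above replace direct manipulation of the old modified_host()/modified_device() counters with the DualView modify<>()/need_sync<>() interface, so the container now participates in the same flag bookkeeping as any other DualView. A short sketch of typical use after this change; the values are illustrative:

Kokkos::vector<double> v(16, 1.0);           // assign() marks whichever side it filled as modified
v.push_back(2.0);                            // syncs to the host, appends, and marks the host side modified
v.sync<Kokkos::DefaultExecutionSpace>();     // bring the device copy up to date before device-side use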
View File

@ -23,6 +23,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
threads/TestThreads_DynRankViewAPI_rank12345.cpp
threads/TestThreads_DynRankViewAPI_rank67.cpp
threads/TestThreads_ErrorReporter.cpp
threads/TestThreads_OffsetView.cpp
threads/TestThreads_ScatterView.cpp
threads/TestThreads_StaticCrsGraph.cpp
threads/TestThreads_UnorderedMap.cpp
@ -47,6 +48,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
serial/TestSerial_DynRankViewAPI_rank12345.cpp
serial/TestSerial_DynRankViewAPI_rank67.cpp
serial/TestSerial_ErrorReporter.cpp
serial/TestSerial_OffsetView.cpp
serial/TestSerial_ScatterView.cpp
serial/TestSerial_StaticCrsGraph.cpp
serial/TestSerial_UnorderedMap.cpp
@ -71,6 +73,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
openmp/TestOpenMP_DynRankViewAPI_rank12345.cpp
openmp/TestOpenMP_DynRankViewAPI_rank67.cpp
openmp/TestOpenMP_ErrorReporter.cpp
openmp/TestOpenMP_OffsetView.cpp
openmp/TestOpenMP_ScatterView.cpp
openmp/TestOpenMP_StaticCrsGraph.cpp
openmp/TestOpenMP_UnorderedMap.cpp
@ -95,6 +98,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
cuda/TestCuda_DynRankViewAPI_rank12345.cpp
cuda/TestCuda_DynRankViewAPI_rank67.cpp
cuda/TestCuda_ErrorReporter.cpp
cuda/TestCuda_OffsetView.cpp
cuda/TestCuda_ScatterView.cpp
cuda/TestCuda_StaticCrsGraph.cpp
cuda/TestCuda_UnorderedMap.cpp

View File

@ -39,6 +39,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
OBJ_CUDA += TestCuda_DynRankViewAPI_rank12345.o
OBJ_CUDA += TestCuda_DynRankViewAPI_rank67.o
OBJ_CUDA += TestCuda_ErrorReporter.o
OBJ_CUDA += TestCuda_OffsetView.o
OBJ_CUDA += TestCuda_ScatterView.o
OBJ_CUDA += TestCuda_StaticCrsGraph.o
OBJ_CUDA += TestCuda_UnorderedMap.o
@ -57,6 +58,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM += TestROCm_DynRankViewAPI_rank12345.o
OBJ_ROCM += TestROCm_DynRankViewAPI_rank67.o
OBJ_ROCM += TestROCm_ErrorReporter.o
OBJ_ROCM += TestROCm_OffsetView.o
OBJ_ROCM += TestROCm_ScatterView.o
OBJ_ROCM += TestROCm_StaticCrsGraph.o
OBJ_ROCM += TestROCm_UnorderedMap.o
@ -75,6 +77,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS += TestThreads_DynRankViewAPI_rank12345.o
OBJ_THREADS += TestThreads_DynRankViewAPI_rank67.o
OBJ_THREADS += TestThreads_ErrorReporter.o
OBJ_THREADS += TestThreads_OffsetView.o
OBJ_THREADS += TestThreads_ScatterView.o
OBJ_THREADS += TestThreads_StaticCrsGraph.o
OBJ_THREADS += TestThreads_UnorderedMap.o
@ -93,6 +96,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank12345.o
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank67.o
OBJ_OPENMP += TestOpenMP_ErrorReporter.o
OBJ_OPENMP += TestOpenMP_OffsetView.o
OBJ_OPENMP += TestOpenMP_ScatterView.o
OBJ_OPENMP += TestOpenMP_StaticCrsGraph.o
OBJ_OPENMP += TestOpenMP_UnorderedMap.o
@ -111,6 +115,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank12345.o
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank67.o
OBJ_SERIAL += TestSerial_ErrorReporter.o
OBJ_SERIAL += TestSerial_OffsetView.o
OBJ_SERIAL += TestSerial_ScatterView.o
OBJ_SERIAL += TestSerial_StaticCrsGraph.o
OBJ_SERIAL += TestSerial_UnorderedMap.o

View File

@ -729,6 +729,7 @@ public:
static void run_tests() {
run_test_resize_realloc();
run_test_mirror();
run_test_mirror_and_copy();
run_test_scalar();
run_test();
run_test_const();
@ -885,6 +886,69 @@ public:
}
}
static void run_test_mirror_and_copy()
{
// LayoutLeft
{
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.rank(), a_org.rank() );
ASSERT_EQ( a_h.rank(), a_h2.rank() );
ASSERT_EQ( a_h.rank(), a_h3.rank() );
ASSERT_EQ( a_h.rank(), a_d.rank() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
// LayoutRight
{
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
ASSERT_EQ( a_h.rank(), a_org.rank() );
ASSERT_EQ( a_h.rank(), a_h2.rank() );
ASSERT_EQ( a_h.rank(), a_h3.rank() );
ASSERT_EQ( a_h.rank(), a_d.rank() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
}
static void run_test_scalar()
{
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
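The new run_test_mirror_and_copy test above exercises Kokkos::create_mirror_view_and_copy for DynRankView: the call allocates and copies only when the requested memory space differs from the source's, otherwise it returns the original view unchanged, which is what the pointer-equality assertions rely on. A minimal sketch of the same call on a plain View (names invented, not taken from this patch):

#include <Kokkos_Core.hpp>

void mirror_and_copy_demo() {
  Kokkos::View<double*, Kokkos::HostSpace> h("h", 10);
  h(5) = 42.0;
  // Copies to device memory only if the default device's memory space
  // is not HostSpace; otherwise the same view is returned (no allocation).
  auto d = Kokkos::create_mirror_view_and_copy(
      Kokkos::DefaultExecutionSpace::memory_space(), h);
  // Round-trip back to the host; on a host-only build this is again a no-op.
  auto h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
  (void) h2;
}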

View File

@ -0,0 +1,426 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
/*
* FIXME the OffsetView class is really not very well tested.
*/
#ifndef CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
#define CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
#include <gtest/gtest.h>
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_OffsetView.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
using std::endl;
using std::cout;
namespace Test{
template <typename Scalar, typename Device>
void test_offsetview_construction(unsigned int size)
{
typedef Kokkos::Experimental::OffsetView<Scalar**, Device> offset_view_type;
typedef Kokkos::View<Scalar**, Device> view_type;
Kokkos::Experimental::index_list_type range0 = {-1, 3};
Kokkos::Experimental::index_list_type range1 = {-2, 2};
offset_view_type ov("firstOV", range0, range1);
ASSERT_EQ("firstOV", ov.label());
ASSERT_EQ(2, ov.Rank);
ASSERT_EQ(ov.begin(0), -1);
ASSERT_EQ(ov.end(0), 4);
ASSERT_EQ(ov.begin(1), -2);
ASSERT_EQ(ov.end(1), 3);
ASSERT_EQ(ov.extent(0), 5);
ASSERT_EQ(ov.extent(1), 5);
const int ovmin0 = ov.begin(0);
const int ovend0 = ov.end(0);
const int ovmin1 = ov.begin(1);
const int ovend1 = ov.end(1);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::Experimental::OffsetView<Scalar*, Device> offsetV1("OneDOffsetView", range0);
Kokkos::RangePolicy<Device, int> rangePolicy1(offsetV1.begin(0), offsetV1.end(0));
Kokkos::parallel_for(rangePolicy1, KOKKOS_LAMBDA (const int i){
offsetV1(i) = 1;
}
);
Kokkos::fence();
int OVResult = 0;
Kokkos::parallel_reduce(rangePolicy1, KOKKOS_LAMBDA(const int i, int & updateMe){
updateMe += offsetV1(i);
}, OVResult);
Kokkos::fence();
ASSERT_EQ(OVResult, offsetV1.end(0) - offsetV1.begin(0)) << "found wrong number of elements in OffsetView that was summed.";
}
{ // test deep copy of a scalar const value into a mirror
const int constVal = 6;
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::Experimental::deep_copy(hostOffsetView, constVal);
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
ASSERT_EQ(hostOffsetView(i,j), constVal) << "Bad data found in OffsetView";
}
}
}
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::point_type point_type;
range_type rangePolicy2D(point_type{ {ovmin0, ovmin1 } },
point_type{ { ovend0, ovend1 } });
const int constValue = 9;
Kokkos::parallel_for(rangePolicy2D, KOKKOS_LAMBDA (const int i, const int j) {
ov(i,j) = constValue;
}
);
// test OffsetView to OffsetView-mirror deep copy
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::Experimental::deep_copy(hostOffsetView, ov);
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
ASSERT_EQ(hostOffsetView(i,j), constValue) << "Bad data found in OffsetView";
}
}
int OVResult = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j);
}, OVResult);
int answer = 0;
for(int i = ov.begin(0); i < ov.end(0); ++i) {
for(int j = ov.begin(1); j < ov.end(1); ++j) {
answer += constValue;
}
}
ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView";
#endif
{
offset_view_type ovCopy(ov);
ASSERT_EQ(ovCopy==ov, true) <<
"Copy constructor or equivalence operator broken";
}
{
offset_view_type ovAssigned = ov;
ASSERT_EQ(ovAssigned==ov, true) <<
"Assignment operator or equivalence operator broken";
}
{ //construct OffsetView from a View plus begins array
const int extent0 = 100;
const int extent1 = 200;
const int extent2 = 300;
Kokkos::View<Scalar***, Device> view3D("view3D", extent0, extent1, extent2);
Kokkos::deep_copy(view3D, 1);
Kokkos::Array<int64_t,3> begins = {{-10, -20, -30}};
Kokkos::Experimental::OffsetView<Scalar***, Device> offsetView3D(view3D, begins);
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>, Kokkos::IndexType<int64_t> > range3_type;
typedef typename range3_type::point_type point3_type;
range3_type rangePolicy3DZero(point3_type{ {0, 0, 0 } },
point3_type{ { extent0, extent1, extent2 } });
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int view3DSum = 0;
Kokkos::parallel_reduce(rangePolicy3DZero, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
updateMe += view3D(i, j, k);
}, view3DSum);
range3_type rangePolicy3D(point3_type{ {begins[0], begins[1], begins[2] } },
point3_type{ { begins[0] + extent0, begins[1] + extent1, begins[2] + extent2 } });
int offsetView3DSum = 0;
Kokkos::parallel_reduce(rangePolicy3D, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
updateMe += offsetView3D(i, j, k);
}, offsetView3DSum);
ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken.";
#endif
}
view_type viewFromOV = ov.view();
ASSERT_EQ(viewFromOV == ov, true) <<
"OffsetView::view() or equivalence operator View == OffsetView broken";
{
offset_view_type ovFromV(viewFromOV, {-1, -2});
ASSERT_EQ(ovFromV == viewFromOV , true) <<
"Construction of OffsetView from View or equivalence operator OffsetView == View broken";
}
{
offset_view_type ovFromV = viewFromOV;
ASSERT_EQ(ovFromV == viewFromOV , true) <<
"Construction of OffsetView from View by assignment (implicit conversion) or equivalence operator OffsetView == View broken";
}
{// test offsetview to view deep copy
view_type aView("aView", ov.extent(0), ov.extent(1));
Kokkos::Experimental::deep_copy(aView, ov);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
}, sum);
ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken.";
#endif
}
{// test view to offsetview deep copy
view_type aView("aView", ov.extent(0), ov.extent(1));
Kokkos::deep_copy(aView, 99);
Kokkos::Experimental::deep_copy(ov, aView);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
}, sum);
ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken.";
#endif
}
}
template <typename Scalar, typename Device>
void test_offsetview_subview(unsigned int size)
{
{//test subview 1
Kokkos::Experimental::OffsetView<Scalar*, Device> sliceMe("offsetToSlice", {-10, 20});
{
auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0);
ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken.";
}
}
{//test subview 2
Kokkos::Experimental::OffsetView<Scalar**, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30});
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),-2);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
{//test subview rank 3
Kokkos::Experimental::OffsetView<Scalar***, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40});
//slice 1
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(),Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(), 0,Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(),Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(), Kokkos::make_pair(-30, -21));
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
ASSERT_EQ(offsetSubview.begin(0) , -20);
ASSERT_EQ(offsetSubview.end(0) , 31);
ASSERT_EQ(offsetSubview.begin(1) , 0);
ASSERT_EQ(offsetSubview.end(1) , 9);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::point_type point_type;
const int b0 = offsetSubview.begin(0);
const int b1 = offsetSubview.begin(1);
const int e0 = offsetSubview.end(0);
const int e1 = offsetSubview.end(1);
range_type rangeP2D(point_type{ {b0, b1 } }, point_type{ { e0, e1} });
Kokkos::parallel_for(rangeP2D, KOKKOS_LAMBDA(const int i, const int j) {
offsetSubview(i,j) = 6;
}
);
int sum = 0;
Kokkos::parallel_reduce(rangeP2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += offsetSubview(i, j);
}, sum);
ASSERT_EQ(sum, 6*(e0-b0)*(e1-b1));
#endif
}
// slice 2
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
{//test subview rank 4
Kokkos::Experimental::OffsetView<Scalar****, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40}, {-40, 50});
//slice 1
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),Kokkos::ALL(), Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe ,Kokkos::ALL(), 0, Kokkos::ALL(),Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe , 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL() );
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
// slice 2
auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken.";
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
// slice 3
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
}
TEST_F( TEST_CATEGORY, offsetview_construction) {
test_offsetview_construction<int,TEST_EXECSPACE>(10);
}
TEST_F( TEST_CATEGORY, offsetview_subview) {
test_offsetview_subview<int,TEST_EXECSPACE>(10);
}
} // namespace Test
#endif /* CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ */
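
Taken together, the new TestOffsetView.hpp covers construction, iteration, deep copies, and subviews of the experimental OffsetView container. A short usage sketch distilled from the test above (names invented; assumes device lambda support where CUDA is the backend). Note that the index ranges are inclusive {first, last} pairs, so {-1, 3} yields begin(0) == -1 and end(0) == 4:

#include <Kokkos_Core.hpp>
#include <Kokkos_OffsetView.hpp>

void offsetview_demo() {
  // 5 x 5 entries, with indices starting at -1 and -2 respectively.
  Kokkos::Experimental::OffsetView<double**, Kokkos::DefaultExecutionSpace>
      ov("ov", {-1, 3}, {-2, 2});

  const int b0 = ov.begin(0), b1 = ov.begin(1);
  const int e0 = ov.end(0),   e1 = ov.end(1);
  typedef Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
  typedef range_type::point_type point_type;
  range_type policy(point_type{{b0, b1}}, point_type{{e0, e1}});

  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i, const int j) {
    ov(i, j) = double(i + j);   // negative indices are valid inside the declared ranges
  });
  Kokkos::fence();
}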

View File

@ -80,7 +80,9 @@ void test_scatter_view_config(int n)
Kokkos::Experimental::contribute(original_view, scatter_view);
}
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::fence();
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
@ -111,9 +113,6 @@ struct TestDuplicatedScatterView {
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
}
};
@ -127,6 +126,16 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {
};
#endif
#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
TestDuplicatedScatterView(int) {
}
};
#endif
template <typename ExecSpace>
void test_scatter_view(int n)
{
@ -142,16 +151,28 @@ void test_scatter_view(int n)
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
}
#ifdef KOKKOS_ENABLE_SERIAL
if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
#endif
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
#ifdef KOKKOS_ENABLE_SERIAL
}
#endif
TestDuplicatedScatterView<ExecSpace> duptest(n);
}
TEST_F( TEST_CATEGORY, scatterview) {
#ifndef KOKKOS_ENABLE_ROCM
test_scatter_view<TEST_EXECSPACE>(10);
#ifdef KOKKOS_ENABLE_DEBUG
test_scatter_view<TEST_EXECSPACE>(100000);
#else
test_scatter_view<TEST_EXECSPACE>(10000000);
#endif
#endif
}
} // namespace Test
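
The contribute() call and the Duplicated/NonDuplicated, Atomic/NonAtomic configurations exercised above correspond to the usual ScatterView pattern: accumulate through a per-thread (or atomic) handle inside the kernel, then merge the contributions back into the original view. A rough sketch with invented names, assuming the experimental ScatterView header shipped with this Kokkos version:

#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

void histogram(Kokkos::View<int*> bins, Kokkos::View<const int*> samples) {
  Kokkos::Experimental::ScatterView<int*> scatter(bins);
  Kokkos::parallel_for("accumulate", samples.extent(0), KOKKOS_LAMBDA(const int i) {
    auto access = scatter.access();   // duplicated or atomic handle, chosen per backend
    access(samples(i)) += 1;          // race-free scatter-add
  });
  Kokkos::Experimental::contribute(bins, scatter);  // reduce duplicates into bins
}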

View File

@ -46,6 +46,7 @@
#include <vector>
#include <Kokkos_StaticCrsGraph.hpp>
#include <Kokkos_Core.hpp>
/*--------------------------------------------------------------------------*/
namespace Test {

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<cuda/TestCuda_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<openmp/TestOpenMP_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -60,6 +60,6 @@ protected:
} // namespace Test
#define TEST_CATEGORY rocm
#define TEST_EXECSPACE Kokkos::ROCm
#define TEST_EXECSPACE Kokkos::Experimental::ROCm
#endif

View File

@ -0,0 +1,46 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<serial/TestSerial_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<threads/TestThreads_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -108,3 +108,7 @@ else()
endif()
#-----------------------------------------------------------------------------
# build and install pkgconfig file
CONFIGURE_FILE(kokkos.pc.in kokkos.pc @ONLY)
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)

View File

@ -208,7 +208,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -264,7 +264,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -321,7 +321,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -370,7 +370,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {

View File

@ -453,6 +453,8 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
// Copy to device memory
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
@ -491,6 +493,9 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
@ -525,6 +530,8 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
}
//----------------------------------------------------------------------------
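
The three added assignments guard against the strncpy-style copy into the fixed-length label buffer: when the label is at least maximum_label_length characters long, strncpy leaves the buffer without a terminating NUL, so the last byte is forced to zero. The same idiom in isolation (buffer size invented):

#include <cstring>

void set_label(char (&dst)[32], const char* src) {
  std::strncpy(dst, src, sizeof(dst));
  // strncpy does not null-terminate when src fills the whole buffer.
  dst[sizeof(dst) - 1] = '\0';
}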

View File

@ -689,9 +689,13 @@ Cuda::size_type cuda_internal_multiprocessor_count()
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
{
#if defined(KOKKOS_ARCH_KEPLER)
// Compute capability 3.0 through 3.7
enum : int { max_resident_blocks_per_multiprocessor = 16 };
#else
// Compute capability 5.0 through 6.2
enum : int { max_resident_blocks_per_multiprocessor = 32 };
#endif
return CudaInternal::singleton().m_multiProcCount
* max_resident_blocks_per_multiprocessor ;
};
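
With the new comments, the limit is simply the multiprocessor count times the per-architecture cap on resident blocks. As assumed examples (not from the patch): a 15-SM Kepler-class device allows 15 * 16 = 240 concurrent blocks, while a 56-SM Pascal-class device allows 56 * 32 = 1792.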

View File

@ -52,22 +52,22 @@
namespace Kokkos { namespace Impl {
template<class DriverType, bool Large>
template<class DriverType, class LaunchBounds, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
template<class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int blockSize=1024;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
@ -76,8 +76,9 @@ struct CudaGetMaxBlockSize<DriverType,true> {
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
if(numBlocks>0) return blockSize;
while (blockSize>32 && numBlocks==0) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
@ -87,19 +88,30 @@ struct CudaGetMaxBlockSize<DriverType,true> {
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
int blockSizeUpperBound = blockSize*2;
while (blockSize<blockSizeUpperBound && numBlocks>0) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
return blockSize - 32;
}
};
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
unsigned int blockSize=1024;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -107,8 +119,9 @@ struct CudaGetMaxBlockSize<DriverType,false> {
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
if(numBlocks>0) return blockSize;
while (blockSize>32 && numBlocks==0) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
@ -118,24 +131,121 @@ struct CudaGetMaxBlockSize<DriverType,false> {
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
unsigned int blockSizeUpperBound = blockSize*2;
while (blockSize<blockSizeUpperBound && numBlocks>0) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
return blockSize - 32;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
unsigned int blockSize=MaxThreadsPerBlock;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
oldNumBlocks = numBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
unsigned int blockSize=MaxThreadsPerBlock;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
oldNumBlocks = numBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
template<class DriverType, bool Large>
template<class DriverType, class LaunchBounds, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
template<class DriverType, class LaunchBounds>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -165,7 +275,7 @@ struct CudaGetOptBlockSize<DriverType,true> {
};
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -194,6 +304,75 @@ struct CudaGetOptBlockSize<DriverType,false> {
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
while(blockSize < max_threads_per_block ) {
blockSize*=2;
// calculate the occupancy with that block size and check whether it's larger than the largest one found so far
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
if(maxOccupancy > 0)
return bestBlockSize;
return -1;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
while(blockSize < max_threads_per_block ) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
if(maxOccupancy > 0)
return bestBlockSize;
return -1;
}
};
}} // namespace Kokkos::Impl
#endif // KOKKOS_ENABLE_CUDA
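
Two things change in this file: the block-size search now starts from the largest candidate (1024, or MaxThreadsPerBlock when launch bounds are given), halves while zero blocks fit, and then steps the size up in warp-sized increments of 32 to find the largest block size that still achieves the required occupancy; and the new specializations are selected by a Kokkos::LaunchBounds argument carried by the execution policy. A sketch of how launch bounds are requested from user code (sizes invented; assumes CUDA lambda support is enabled):

#include <Kokkos_Core.hpp>

void launch_with_bounds(const int league_size) {
  // LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM> is forwarded to the kernel
  // as __launch_bounds__(512, 2) and selects the specializations added above.
  typedef Kokkos::TeamPolicy<Kokkos::Cuda, Kokkos::LaunchBounds<512, 2> > policy_type;
  Kokkos::parallel_for("bounded", policy_type(league_size, Kokkos::AUTO),
    KOKKOS_LAMBDA(const policy_type::member_type& team) {
      (void) team;   // per-team work goes here
    });
}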

View File

@ -148,6 +148,9 @@ namespace Kokkos {
namespace Impl {
namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() {
return lock_array_copied;
}
}
}
}

View File

@ -60,6 +60,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -114,6 +115,7 @@ public:
//----------------------------------------
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
inline static
int team_size_max( const FunctorType & functor )
@ -131,7 +133,35 @@ public:
return n ;
}
#endif
template<class FunctorType>
int team_size_max( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) );
return block_size/vector_length();
}
template<class FunctorType>
int team_size_max( const FunctorType& f, const ParallelReduceTag& ) const {
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
// Currently we require Power-of-2 team size for reductions.
int p2 = 1;
while(p2<=block_size) p2*=2;
p2/=2;
return p2/vector_length();
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor )
{ return team_size_max( functor ); }
@ -143,11 +173,41 @@ public:
if(max<1) max = 1;
return max;
}
#endif
template<class FunctorType>
int team_size_recommended( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double));
return block_size/vector_length();
}
template<class FunctorType>
int team_size_recommended( const FunctorType& f, const ParallelReduceTag& ) const {
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
return block_size/vector_length();
}
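// Usage sketch for the new tag-based queries (not part of this diff; functor
// and policy names are assumed):
//   Kokkos::TeamPolicy<Kokkos::Cuda> policy(league_size, Kokkos::AUTO);
//   int tmax = policy.team_size_max(functor, Kokkos::ParallelForTag());
//   int trec = policy.team_size_recommended(functor, Kokkos::ParallelReduceTag());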
inline static
int vector_length_max()
{ return Impl::CudaTraits::WarpSize; }
inline static
int scratch_size_max(int level)
{ return (level==0?
1024*40: // 48kB is the max for CUDA, but we need some for team_member.reduce etc.
20*1024*1024); // arbitrarily setting this to 20MB, for a Volta V100 that would give us about 3.2GB for 2 teams per SM
}
//----------------------------------------
inline int vector_length() const { return m_vector_length ; }
@ -419,7 +479,7 @@ public:
void execute() const
{
const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( m_functor , 1, 0 , 0 );
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds>( m_functor , 1, 0 , 0 );
const dim3 block( 1 , block_size , 1);
const dim3 grid( std::min( typename Policy::index_type(( nwork + block.y - 1 ) / block.y) , typename Policy::index_type(cuda_internal_maximum_grid_count()) ) , 1 , 1);
@ -654,7 +714,7 @@ public:
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
, m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
@ -670,7 +730,7 @@ public:
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor, LaunchBounds >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
}
@ -725,12 +785,13 @@ public:
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
enum { UseShflReduction = false };//((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
@ -752,12 +813,12 @@ public:
__device__ inline
void operator() () const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
{*/
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -786,7 +847,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -798,10 +860,9 @@ public:
}
}
__device__ inline
/* __device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
@ -832,7 +893,7 @@ public:
*result = value;
}
}
}
}*/
// Determine block size constrained by shared memory:
static inline
@ -863,16 +924,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
}
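
The m_result_ptr_device_accessible branch above lets the final reduction block write straight into the result view when that view's memory space is accessible from CudaSpace, skipping the fence and the DeepCopy back to the host. A sketch of what that enables (names invented; assumes CUDA lambda support):

#include <Kokkos_Core.hpp>

double device_resident_sum(const int n) {
  Kokkos::View<double, Kokkos::CudaSpace> dev_sum("dev_sum");
  Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
    KOKKOS_LAMBDA(const int i, double& update) { update += double(i); },
    dev_sum);   // result stays in device memory; no host copy is forced
  // Copy back explicitly only when the host actually needs the value.
  auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dev_sum);
  return h();
}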
@ -883,17 +946,18 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -906,6 +970,7 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -953,6 +1018,7 @@ public:
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
@ -960,7 +1026,7 @@ public:
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && (ValueTraits::StaticValueSize!=0)) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
@ -978,12 +1044,12 @@ public:
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
{*/
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -1007,7 +1073,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -1019,7 +1086,7 @@ public:
}
}
__device__ inline
/* __device__ inline
void run(const DummyShflReductionType&) const
{
@ -1051,7 +1118,7 @@ public:
}
}
}
*/
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
@ -1089,16 +1156,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
}
@ -1109,17 +1178,18 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1132,6 +1202,7 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1174,7 +1245,7 @@ public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
enum { UseShflReduction = (true && (ValueTraits::StaticValueSize!=0)) };
private:
typedef double DummyShflReductionType;
@ -1191,6 +1262,7 @@ private:
const FunctorType m_functor ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
@ -1279,7 +1351,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -1312,12 +1385,18 @@ public:
, value );
}
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
pointer_type const result = m_result_ptr_device_accessible? m_result_ptr :
(pointer_type) ( m_unified_space ? m_unified_space : m_scratch_space );
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
if(
Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)
//This breaks a test
// Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
// kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags)
) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
@ -1331,7 +1410,7 @@ public:
{
const int nwork = m_league_size * m_team_size ;
if ( nwork ) {
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024*32) )
:std::min( m_league_size , m_team_size );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
@ -1344,16 +1423,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
}
}
}
}
@ -1364,16 +1445,17 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1383,17 +1465,17 @@ public:
, m_scratch_ptr{NULL,NULL}
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
@ -1430,9 +1512,7 @@ public:
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
}
if ( unsigned(m_team_size) >
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1444,6 +1524,7 @@ public:
: m_functor( arg_functor )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1453,7 +1534,7 @@ public:
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
@ -1486,10 +1567,7 @@ public:
CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1753,7 +1831,7 @@ public:
// Occupancy calculator assumes whole block.
m_team_size =
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >
( arg_functor
, arg_policy.vector_length()
, arg_policy.team_scratch_size(0)
@ -1970,7 +2048,9 @@ private:
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
@ -1981,7 +2061,11 @@ private:
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
@ -2189,6 +2273,9 @@ private:
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
@ -2201,6 +2288,11 @@ private:
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
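A minimal usage sketch of what the m_result_ptr_device_accessible path added above enables: passing a device-resident result View to parallel_reduce, so ParallelReduce can write the final value directly on the device instead of copying it back to the host. The view names below are hypothetical.

#include <Kokkos_Core.hpp>

void device_result_sketch() {
  Kokkos::View<double*> x("x", 1000);      // data in the default (Cuda) memory space
  Kokkos::View<double>  d_sum("d_sum");    // rank-0 result View, also device resident
  Kokkos::deep_copy(x, 1.0);
  Kokkos::parallel_reduce("sum", x.extent(0),
      KOKKOS_LAMBDA(const int i, double& lsum) { lsum += x(i); },
      d_sum);                              // result lands in CudaSpace; no host copy-back
  Kokkos::fence();                         // make the device result ready before use
}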

View File

@ -194,8 +194,9 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
*/
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const uint32_t max_active_thread = blockDim.y) {
@ -214,8 +215,9 @@ inline void cuda_intra_warp_reduction( ValueType& result,
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_inter_warp_reduction( ValueType& value,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_inter_warp_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
@ -247,8 +249,9 @@ inline void cuda_inter_warp_reduction( ValueType& value,
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_block_reduction( ValueType& value,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_intra_block_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(value,join,max_active_thread);
@ -314,31 +317,52 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
}
}
//The last block has in its thread=0 the global reduction value through "value"
@ -478,31 +502,52 @@ cuda_inter_block_reduction( const ReducerType& reducer,
if( id + 1 < int(gridDim.x) )
reducer.join(value, tmp);
}
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
}
}
@ -513,6 +558,213 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#endif
}
template<class FunctorType, class ArgTag, bool DoScan, bool UseShfl>
struct CudaReductionsFunctor;
template<class FunctorType, class ArgTag>
struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type Scalar;
__device__
static inline void scalar_intra_warp_reduction(
const FunctorType& functor,
Scalar value, // Contribution
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
const int width, // How much of the warp participates
Scalar& result)
{
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
Scalar tmp;
cuda_shfl_down(tmp,value,delta,width,mask);
ValueJoin::join( functor , &value, &tmp);
}
cuda_shfl(result,value,0,width,mask);
}
__device__
static inline void scalar_intra_block_reduction(
const FunctorType& functor,
Scalar value,
const bool skip,
Scalar* my_global_team_buffer_element,
const int shared_elements,
Scalar* shared_team_buffer_element) {
const int warp_id = (threadIdx.y*blockDim.x)/32;
Scalar* const my_shared_team_buffer_element =
shared_team_buffer_element + warp_id%shared_elements;
// Warp Level Reduction, ignoring Kokkos vector entries
scalar_intra_warp_reduction(functor,value,skip,32,value);
if(warp_id<shared_elements) {
*my_shared_team_buffer_element=value;
}
// Wait for every warp to be done before using one warp to do final cross warp reduction
__syncthreads();
const int num_warps = blockDim.x*blockDim.y/32;
for(int w = shared_elements; w<num_warps; w+=shared_elements) {
if(warp_id>=w && warp_id<w+shared_elements) {
if((threadIdx.y*blockDim.x + threadIdx.x)%32==0)
ValueJoin::join( functor , my_shared_team_buffer_element, &value);
}
__syncthreads();
}
if( warp_id == 0) {
ValueInit::init( functor , &value );
for(unsigned int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
ValueJoin::join( functor , &value,&shared_team_buffer_element[i]);
scalar_intra_warp_reduction(functor,value,false,32,*my_global_team_buffer_element);
}
}
__device__
static inline bool scalar_inter_block_reduction(
const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags ) {
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
Scalar value = shared_team_buffer_elements[threadIdx.y];
int shared_elements=blockDim.x*blockDim.y/32;
int global_elements=block_count;
__syncthreads();
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
}
bool is_last_block = false;
if(__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block=true;
*global_flags = 0;
ValueInit::init( functor, &value);
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
}
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
}
return is_last_block;
}
};
template<class FunctorType, class ArgTag>
struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type Scalar;
__device__
static inline void scalar_intra_warp_reduction(
const FunctorType& functor,
Scalar* value, // Contribution
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
const int width) // How much of the warp participates
{
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
#endif
const int lane_id = (threadIdx.y*blockDim.x+threadIdx.x)%32;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
if(lane_id + delta<32) {
ValueJoin::join( functor , value, value+delta);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
}
*value=*(value-lane_id);
}
__device__
static inline void scalar_intra_block_reduction(
const FunctorType& functor,
Scalar value,
const bool skip,
Scalar* result,
const int shared_elements,
Scalar* shared_team_buffer_element) {
const int warp_id = (threadIdx.y*blockDim.x)/32;
Scalar* const my_shared_team_buffer_element =
shared_team_buffer_element + threadIdx.y*blockDim.x+threadIdx.x;
*my_shared_team_buffer_element = value;
// Warp Level Reduction, ignoring Kokkos vector entries
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,skip,32);
// Wait for every warp to be done before using one warp to do final cross warp reduction
__syncthreads();
if( warp_id == 0) {
const unsigned int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
if(delta<blockDim.x*blockDim.y)
*my_shared_team_buffer_element = shared_team_buffer_element[delta];
KOKKOS_IMPL_CUDA_SYNCWARP;
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,false,blockDim.x*blockDim.y/32);
if(threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
}
}
__device__
static inline bool scalar_inter_block_reduction(
const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags ) {
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
Scalar value = shared_team_buffer_elements[threadIdx.y];
int shared_elements=blockDim.x*blockDim.y/32;
int global_elements=block_count;
__syncthreads();
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
}
bool is_last_block = false;
if(__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block=true;
*global_flags = 0;
ValueInit::init( functor, &value);
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
}
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
}
return is_last_block;
}
};
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
@ -639,14 +891,15 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
*
* Global reduce result is in the last threads' 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
bool cuda_single_inter_block_reduce_scan2( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef Cuda::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
@ -655,7 +908,6 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
//typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
@ -678,12 +930,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
//#if (__CUDA_ARCH__ < 500)
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
//#else
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
//#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
@ -725,6 +972,22 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
return is_last_block ;
}
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
if(!DoScan && ValueTraits::StaticValueSize)
return Kokkos::Impl::CudaReductionsFunctor<FunctorType,ArgTag,false,(ValueTraits::StaticValueSize>16)>::scalar_inter_block_reduction(functor,block_id,block_count,shared_data,global_data,global_flags);
else
return cuda_single_inter_block_reduce_scan2<DoScan, FunctorType, ArgTag>(functor, block_id, block_count, shared_data, global_data, global_flags);
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
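A standalone CUDA sketch (not the Kokkos code itself) of the last-block idiom that scalar_inter_block_reduction above relies on: every block publishes its partial, atomically increments a global counter, and the block that sees the counter reach gridDim.x knows all partials are visible and performs the final combine. Names and the trivial partial value are placeholders.

__device__ unsigned int finished_blocks = 0;   // compile as CUDA (.cu)

__global__ void finalize_sketch(double* partials, double* result) {
  if (threadIdx.x == 0) partials[blockIdx.x] = 1.0;  // stand-in for a real per-block partial
  __threadfence();                                   // make the partial visible to other blocks
  __shared__ bool is_last;
  if (threadIdx.x == 0) {
    const unsigned int done = atomicAdd(&finished_blocks, 1u) + 1u;
    is_last = (done == gridDim.x);
  }
  __syncthreads();
  if (is_last && threadIdx.x == 0) {
    finished_blocks = 0;                             // reset for the next launch
    double sum = 0.0;
    for (unsigned int b = 0; b < gridDim.x; ++b) sum += partials[b];
    *result = sum;                                   // equals gridDim.x in this toy case
  }
}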

View File

@ -160,7 +160,7 @@ public:
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast( ValueType & val, const int& thread_id) const
void team_broadcast( ValueType & val, const int& thread_id ) const
{
#ifdef __CUDA_ARCH__
if ( 1 == blockDim.z ) { // team == block
@ -178,6 +178,29 @@ public:
}
#endif
}
template<class Closure, class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast( Closure const & f, ValueType & val, const int& thread_id ) const
{
#ifdef __CUDA_ARCH__
f( val );
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
val = *((ValueType*) m_team_reduce);
}
else { // team <= warp
ValueType tmp( val ); // input might not be a register variable
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
}
#endif
}
//--------------------------------------------------------------------------
/**\brief Reduction across a team
@ -200,92 +223,7 @@ public:
team_reduce( ReducerType const & reducer ) const noexcept
{
#ifdef __CUDA_ARCH__
typedef typename ReducerType::value_type value_type ;
value_type tmp( reducer.reference() );
// reduce within the warp using shuffle
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
// Root of each vector lane reduces:
if ( 0 == threadIdx.x && wx < i ) {
reducer.join( tmp , reducer.reference() );
}
}
if ( 1 < blockDim.z ) { // team <= warp
// broadcast result from root vector lane of root thread
cuda_shfl( reducer.reference() , tmp
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
}
else { // team == block
// Reduce across warps using shared memory
// Broadcast result within block
// Number of warps, blockDim.y may not be power of two:
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
// Warp index:
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
// Number of shared memory entries for the reduction:
int nsh = m_team_reduce_size / sizeof(value_type);
// Using at most one entry per warp:
if ( nw < nsh ) nsh = nw ;
__syncthreads(); // Wait before shared data write
if ( 0 == wx && wy < nsh ) {
((value_type*) m_team_reduce)[wy] = tmp ;
}
// When more warps than shared entries:
for ( int i = nsh ; i < nw ; i += nsh ) {
__syncthreads();
if ( 0 == wx && i <= wy ) {
const int k = wy - i ;
if ( k < nsh ) {
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
}
}
}
__syncthreads();
// One warp performs the inter-warp reduction:
if ( 0 == wy ) {
// Start at power of two covering nsh
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
const int k = wx + i ;
if ( wx < i && k < nsh ) {
reducer.join( ((value_type*)m_team_reduce)[wx]
, ((value_type*)m_team_reduce)[k] );
__threadfence_block();
}
}
}
__syncthreads(); // Wait for reduction
// Broadcast result to all threads
reducer.reference() = *((value_type*)m_team_reduce);
}
cuda_intra_block_reduction(reducer,blockDim.y);
#endif /* #ifdef __CUDA_ARCH__ */
}
@ -801,7 +739,11 @@ void parallel_for
; i += blockDim.x ) {
closure(i);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
@ -970,7 +912,11 @@ KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
@ -979,7 +925,11 @@ KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
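A minimal sketch (hypothetical league size and label) of the closure overload of team_broadcast added above: every thread applies the closure to its own copy of the value, and the copy held by the named thread is then broadcast to the whole team.

#include <Kokkos_Core.hpp>

void broadcast_sketch() {
  const int nleague = 8;
  Kokkos::parallel_for("bcast", Kokkos::TeamPolicy<>(nleague, Kokkos::AUTO),
      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
        double val = team.team_rank() + 1.0;
        team.team_broadcast([](double& v) { v *= 2.0; }, val, 0);
        // val == 2.0 on every member: thread 0's doubled value was broadcast
      });
}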

View File

@ -2,9 +2,11 @@
#if defined( __CUDA_ARCH__ )
#if ( CUDA_VERSION < 9000 )
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(x) __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK __threadfence_block()
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
@ -12,9 +14,11 @@
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down(x,y,z)
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m);
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(),x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot_sync(m,x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl_sync(m,x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up_sync(0xffffffff,x,y,z)
@ -23,11 +27,16 @@
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down_sync(m,x,y,z)
#endif
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) 0
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) 0
#endif
#if ( CUDA_VERSION >= 9000 ) && (!defined(KOKKOS_COMPILER_CLANG))
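A plain CUDA sketch of why the *_MASK variants defined above take a mask on CUDA 9 and newer: with independent thread scheduling, warp-synchronous ballots and syncs must name exactly the participating lanes rather than assuming the full warp.

#if defined(__CUDACC__) && (CUDA_VERSION >= 9000)
__device__ unsigned masked_vote_sketch(int pred) {
  const unsigned mask  = __activemask();            // lanes currently converged here
  const unsigned votes = __ballot_sync(mask, pred); // ballot restricted to those lanes
  __syncwarp(mask);                                 // re-converge exactly the masked lanes
  return votes;
}
#endif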

View File

@ -279,6 +279,8 @@ public:
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
if(arg_data_ptr == NULL) return handle_type();
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
@ -292,8 +294,7 @@ public:
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
//Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
return handle_type();
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif

View File

@ -46,6 +46,8 @@
#include <initializer_list>
#include <Kokkos_Layout.hpp>
#include<impl/KokkosExp_Host_IterateTile.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
@ -63,13 +65,15 @@
namespace Kokkos {
// ------------------------------------------------------------------ //
// Moved to Kokkos_Layout.hpp for more general accessibility
/*
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
};
*/
template <typename ExecSpace>
struct default_outer_direction

View File

@ -45,11 +45,13 @@
#define KOKKOS_ARRAY_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Error.hpp>
#include <type_traits>
#include <algorithm>
#include <limits>
#include <cstddef>
#include <string>
namespace Kokkos {
@ -132,6 +134,7 @@ public:
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N ; }
KOKKOS_INLINE_FUNCTION static constexpr bool empty(){ return false ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return N ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -160,7 +163,7 @@ public:
return & m_internal_implementation_private_member_data[0];
}
#ifdef KOKKOS_ROCM_CLANG_WORKAROUND
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
// Do not default unless move and move-assignment are also defined
KOKKOS_INLINE_FUNCTION
~Array() = default ;
@ -197,6 +200,7 @@ public:
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return 0 ; }
KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return true ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return 0 ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -261,6 +265,7 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -336,6 +341,7 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
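A small sketch of the fixed-extent Kokkos::Array interface touched above; size() is a static constexpr, and max_size(), added here across the specializations, equals the compile-time extent N for the fixed-size case shown below.

#include <Kokkos_Core.hpp>

void array_sketch() {
  Kokkos::Array<double, 3> a = {{1.0, 2.0, 3.0}};
  static_assert(Kokkos::Array<double, 3>::size() == 3, "compile-time extent");
  // a.max_size() == 3 as well for the fixed-size specialization
  (void)a;
}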

View File

@ -105,7 +105,10 @@ namespace Kokkos {
template< typename T > struct is_ ## CONCEPT { \
private: \
template< typename , typename = std::true_type > struct have : std::false_type {}; \
template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
template< typename U > struct have<U,typename std::is_same< \
typename std::remove_cv<U>::type, \
typename std::remove_cv<typename U:: CONCEPT>::type \
>::type> : std::true_type {}; \
public: \
enum { value = is_ ## CONCEPT::template have<T>::value }; \
};
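A standalone sketch (not the Kokkos macro itself) of the detection idiom shown above, including the remove_cv fix: 'value' becomes true when T carries a nested typedef naming T itself, now also for cv-qualified T.

#include <type_traits>

template <typename T>
struct is_my_concept {
private:
  template <typename, typename = std::true_type>
  struct have : std::false_type {};
  template <typename U>
  struct have<U, typename std::is_same<
                     typename std::remove_cv<U>::type,
                     typename std::remove_cv<typename U::my_concept>::type>::type>
      : std::true_type {};
public:
  enum { value = is_my_concept::template have<T>::value };
};

struct Tagged { using my_concept = Tagged; };
static_assert( is_my_concept<Tagged>::value, "detected");
static_assert( is_my_concept<const Tagged>::value, "cv-qualified types are now detected too");
static_assert(!is_my_concept<int>::value, "no nested typedef, not detected");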

View File

@ -453,8 +453,9 @@ template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename
struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,2,iType,KOKKOS_IMPL_COMPILING_LIBRARY> {
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<2,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<2,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -475,7 +476,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,3,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<3,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<3,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -496,7 +499,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,4,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<4,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<4,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -519,7 +524,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,5,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<5,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<5,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -542,7 +549,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,6,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -566,7 +575,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,7,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -590,7 +601,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,8,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -642,7 +655,9 @@ void view_copy(const DstType& dst, const SrcType& src) {
int64_t strides[DstType::Rank+1];
dst.stride(strides);
Kokkos::Iterate iterate;
if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
if ( Kokkos::is_layouttiled<typename DstType::array_layout>::value ) {
iterate = Kokkos::layout_iterate_type_selector<typename DstType::array_layout>::outer_iteration_pattern;
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
iterate = Kokkos::Iterate::Right;
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutLeft>::value ) {
iterate = Kokkos::Iterate::Left;
@ -1243,9 +1258,9 @@ void deep_copy
ViewTypeFlat;
ViewTypeFlat dst_flat(dst.data(),dst.size());
if(dst.span() < std::numeric_limits<int>::max())
if(dst.span() < std::numeric_limits<int>::max()) {
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int >( dst_flat , value );
else
} else
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int64_t >( dst_flat , value );
Kokkos::fence();
return;
@ -1397,7 +1412,6 @@ void deep_copy
enum { SrcExecCanAccessDst =
Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
// Checking for Overlapping Views.
dst_value_type* dst_start = dst.data();
dst_value_type* dst_end = dst.data() + dst.span();
@ -1493,7 +1507,7 @@ void deep_copy
Kokkos::fence();
} else {
Kokkos::fence();
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),typename src_type::uniform_runtime_const_nomemspace_type(src));
Impl::view_copy(dst, src);
Kokkos::fence();
}
}
@ -1739,8 +1753,7 @@ void deep_copy
exec_space.fence();
} else {
exec_space.fence();
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),
typename src_type::uniform_runtime_const_nomemspace_type(src));
Impl::view_copy(dst, src);
exec_space.fence();
}
}
@ -1917,4 +1930,213 @@ void realloc( Kokkos::View<T,P...> & v ,
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Deduce Mirror Types
template<class Space, class T, class ... P>
struct MirrorViewType {
// The incoming view_type
typedef typename Kokkos::View<T,P...> src_view_type;
// The memory space for the mirror view
typedef typename Space::memory_space memory_space;
// Check whether it is the same memory space
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
// The array_layout
typedef typename src_view_type::array_layout array_layout;
// The data type (we probably want it non-const, since otherwise we can't even deep_copy to it)
typedef typename src_view_type::non_const_data_type data_type;
// The destination view type if it is not the same memory space
typedef Kokkos::View<data_type,array_layout,Space> dest_view_type;
// If it is the same memory_space return the existing view_type
// This will also keep the unmanaged trait if necessary
typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
};
template<class Space, class T, class ... P>
struct MirrorType {
// The incoming view_type
typedef typename Kokkos::View<T,P...> src_view_type;
// The memory space for the mirror view
typedef typename Space::memory_space memory_space;
// Check whether it is the same memory space
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
// The array_layout
typedef typename src_view_type::array_layout array_layout;
// The data type (we probably want it non-const, since otherwise we can't even deep_copy to it)
typedef typename src_view_type::non_const_data_type data_type;
// The destination view type if it is not the same memory space
typedef Kokkos::View<data_type,array_layout,Space> view_type;
};
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror( const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
{
typedef View<T,P...> src_type ;
typedef typename src_type::HostMirror dst_type ;
return dst_type( std::string( src.label() ).append("_mirror")
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, src.extent(0)
, src.extent(1)
, src.extent(2)
, src.extent(3)
, src.extent(4)
, src.extent(5)
, src.extent(6)
, src.extent(7) );
#else
, src.rank_dynamic > 0 ? src.extent(0): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 1 ? src.extent(1): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 2 ? src.extent(2): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 3 ? src.extent(3): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 4 ? src.extent(4): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 5 ? src.extent(5): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 6 ? src.extent(6): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 7 ? src.extent(7): KOKKOS_IMPL_CTOR_DEFAULT_ARG );
#endif
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror( const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
{
typedef View<T,P...> src_type ;
typedef typename src_type::HostMirror dst_type ;
Kokkos::LayoutStride layout ;
layout.dimension[0] = src.extent(0);
layout.dimension[1] = src.extent(1);
layout.dimension[2] = src.extent(2);
layout.dimension[3] = src.extent(3);
layout.dimension[4] = src.extent(4);
layout.dimension[5] = src.extent(5);
layout.dimension[6] = src.extent(6);
layout.dimension[7] = src.extent(7);
layout.stride[0] = src.stride_0();
layout.stride[1] = src.stride_1();
layout.stride[2] = src.stride_2();
layout.stride[3] = src.stride_3();
layout.stride[4] = src.stride_4();
layout.stride[5] = src.stride_5();
layout.stride[6] = src.stride_6();
layout.stride[7] = src.stride_7();
return dst_type( std::string( src.label() ).append("_mirror") , layout );
}
// Create a mirror in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorType<Space,T,P ...>::view_type
create_mirror(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
>::type * = 0) {
return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror_view( const Kokkos::View<T,P...> & src
, typename std::enable_if<(
std::is_same< typename Kokkos::View<T,P...>::memory_space
, typename Kokkos::View<T,P...>::HostMirror::memory_space
>::value
&&
std::is_same< typename Kokkos::View<T,P...>::data_type
, typename Kokkos::View<T,P...>::HostMirror::data_type
>::value
)>::type * = 0
)
{
return src ;
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror_view( const Kokkos::View<T,P...> & src
, typename std::enable_if< ! (
std::is_same< typename Kokkos::View<T,P...>::memory_space
, typename Kokkos::View<T,P...>::HostMirror::memory_space
>::value
&&
std::is_same< typename Kokkos::View<T,P...>::data_type
, typename Kokkos::View<T,P...>::HostMirror::data_type
>::value
)>::type * = 0
)
{
return Kokkos::create_mirror( src );
}
// Create a mirror view in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
return src;
}
// Create a mirror view in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
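A minimal usage sketch of the mirror helpers added above: create_mirror_view_and_copy allocates (without initializing) only when the requested space differs from the source's memory space, deep_copies the data, and otherwise returns the source view unchanged.

#include <Kokkos_Core.hpp>

void mirror_sketch() {
  Kokkos::View<double*> d("d", 100);    // default (device) memory space
  Kokkos::deep_copy(d, 1.0);
  auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
  // h lives in HostSpace and already holds the data of d; if d were host accessible,
  // h would simply be d and no allocation or copy would occur.
  (void)h;
}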

View File

@ -57,6 +57,10 @@
namespace Kokkos {
struct ParallelForTag {};
struct ParallelScanTag {};
struct ParallelReduceTag {};
struct ChunkSize {
int value;
ChunkSize(int value_):value(value_) {}
@ -320,6 +324,10 @@ public:
template< class FunctorType >
static int team_size_recommended( const FunctorType & , const int&);
template<class FunctorType>
int team_size_recommended( const FunctorType & functor , const int vector_length);
//----------------------------------------
/** \brief Construct policy with the given instance of the execution space */
TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
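A usage sketch of the ChunkSize helper declared above, assuming (as in later Kokkos releases) that RangePolicy accepts a ChunkSize constructor argument; the label and sizes are arbitrary.

#include <Kokkos_Core.hpp>

void chunked_sketch() {
  Kokkos::RangePolicy<> policy(0, 1 << 20, Kokkos::ChunkSize(128));
  Kokkos::parallel_for("chunked", policy,
      KOKKOS_LAMBDA(const int i) { (void)i; /* work handed out in 128-iteration chunks */ });
}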

View File

@ -76,6 +76,8 @@ struct LayoutLeft {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
enum { is_extent_constructible = true };
LayoutLeft( LayoutLeft const & ) = default ;
LayoutLeft( LayoutLeft && ) = default ;
LayoutLeft & operator = ( LayoutLeft const & ) = default ;
@ -108,6 +110,8 @@ struct LayoutRight {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
enum { is_extent_constructible = true };
LayoutRight( LayoutRight const & ) = default ;
LayoutRight( LayoutRight && ) = default ;
LayoutRight & operator = ( LayoutRight const & ) = default ;
@ -132,6 +136,8 @@ struct LayoutStride {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = false };
LayoutStride( LayoutStride const & ) = default ;
LayoutStride( LayoutStride && ) = default ;
LayoutStride & operator = ( LayoutStride const & ) = default ;
@ -222,6 +228,8 @@ struct LayoutTileLeft {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = true };
LayoutTileLeft( LayoutTileLeft const & ) = default ;
LayoutTileLeft( LayoutTileLeft && ) = default ;
LayoutTileLeft & operator = ( LayoutTileLeft const & ) = default ;
@ -235,6 +243,144 @@ struct LayoutTileLeft {
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
};
//////////////////////////////////////////////////////////////////////////////////////
enum class Iterate
{
Default,
Left, // Left indices stride fastest
Right // Right indices stride fastest
};
// To check for LayoutTiled
// This is to hide extra compile-time 'identifier' info within the LayoutTiled class by not relying on template specialization to include the ArgN*'s
template < typename LayoutTiledCheck, class Enable = void >
struct is_layouttiled : std::false_type {};
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
template < typename LayoutTiledCheck >
struct is_layouttiled< LayoutTiledCheck, typename std::enable_if<LayoutTiledCheck::is_array_layout_tiled>::type > : std::true_type {};
namespace Experimental {
/// LayoutTiled
// Must have Rank >= 2
template < Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0,
bool IsPowerOfTwo =
( Impl::is_integral_power_of_two(ArgN0) &&
Impl::is_integral_power_of_two(ArgN1) &&
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
)
>
struct LayoutTiled {
static_assert( IsPowerOfTwo
, "LayoutTiled must be given power-of-two tile dimensions" );
#if 0
static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
(Impl::is_integral_power_of_two(ArgN1) ) &&
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
, "LayoutTiled must be given power-of-two tile dimensions" );
#endif
typedef LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo> array_layout ;
static constexpr Iterate outer_pattern = OuterP;
static constexpr Iterate inner_pattern = InnerP;
enum { N0 = ArgN0 };
enum { N1 = ArgN1 };
enum { N2 = ArgN2 };
enum { N3 = ArgN3 };
enum { N4 = ArgN4 };
enum { N5 = ArgN5 };
enum { N6 = ArgN6 };
enum { N7 = ArgN7 };
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = true };
LayoutTiled( LayoutTiled const & ) = default ;
LayoutTiled( LayoutTiled && ) = default ;
LayoutTiled & operator = ( LayoutTiled const & ) = default ;
LayoutTiled & operator = ( LayoutTiled && ) = default ;
KOKKOS_INLINE_FUNCTION
explicit constexpr
LayoutTiled( size_t argN0 = 0 , size_t argN1 = 0 , size_t argN2 = 0 , size_t argN3 = 0
, size_t argN4 = 0 , size_t argN5 = 0 , size_t argN6 = 0 , size_t argN7 = 0
)
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
};
} // namespace Experimental
#endif
// For use with view_copy
template < typename ... Layout >
struct layout_iterate_type_selector {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutRight > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutLeft > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutStride > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
};
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
#endif
} // namespace Kokkos
#endif // #ifndef KOKKOS_LAYOUT_HPP
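
The layout machinery above is what the tiled-view support in this update builds on: LayoutTiled describes the tile shape and traversal order, and layout_iterate_type_selector is what view_copy consults to pick its outer/inner iteration pattern for a given layout. A minimal usage sketch follows; it assumes the tiled ViewMapping that accompanies this Kokkos update, and the view name, extents, and 4x4 tile shape are purely illustrative.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // 2-D tiled layout: tiles traversed Left (column-major) both between and
    // within tiles, with power-of-two 4x4 tiles as the static_assert requires.
    using TileLayout =
        Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left,
                                          Kokkos::Iterate::Left, 4, 4>;

    // is_extent_constructible = true, so the View can be built from extents.
    Kokkos::View<double**, TileLayout, Kokkos::HostSpace> a("A", 8, 8);

    for (int j = 0; j < 8; ++j)
      for (int i = 0; i < 8; ++i)
        a(i, j) = i + 0.1 * j;   // element access goes through the tiled offset map

    // The selector reports the iteration pattern view_copy will use.
    using sel = Kokkos::layout_iterate_type_selector<Kokkos::LayoutRight>;
    static_assert(sel::outer_iteration_pattern == Kokkos::Iterate::Right &&
                  sel::inner_iteration_pattern == Kokkos::Iterate::Right,
                  "LayoutRight iterates Right over both index groups");
  }
  Kokkos::finalize();
  return 0;
}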

View File

@@ -153,7 +153,7 @@
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#if defined( KOKKOS_ENABLE_CXX1Z )
#if defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20 )
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
#endif
#endif
@@ -213,7 +213,7 @@
#define KOKKOS_LAMBDA [=]
#endif
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
#if (defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20) )&& !defined( KOKKOS_CLASS_LAMBDA )
#define KOKKOS_CLASS_LAMBDA [=,*this]
#endif
@@ -521,6 +521,9 @@
#if defined ( KOKKOS_ENABLE_CUDA )
#if ( 9000 <= CUDA_VERSION )
#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
#if ( __CUDA_ARCH__ )
#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
#endif
#endif
#endif
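
The first two hunks above switch the KOKKOS_CLASS_LAMBDA definition from the old KOKKOS_ENABLE_CXX1Z guard to KOKKOS_ENABLE_CXX17/KOKKOS_ENABLE_CXX20, and the last hunk adds an internal syncwarp-mask workaround macro for CUDA 9+ device compilation. As a hedged sketch of the idiom KOKKOS_CLASS_LAMBDA exists for (it needs a C++17 build; the AxpyRunner type and its members are illustrative): capture *this by value in a member function so device code reads a copy of the object instead of dereferencing a host this pointer.

#include <Kokkos_Core.hpp>

struct AxpyRunner {
  double a;
  Kokkos::View<double*> x, y;

  void run() const {
    // KOKKOS_CLASS_LAMBDA expands to [=,*this] (plus __host__ __device__ for
    // CUDA), so a, x, and y below come from the captured copy of *this.
    Kokkos::parallel_for("axpy", x.extent(0),
        KOKKOS_CLASS_LAMBDA(const int i) { y(i) += a * x(i); });
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    AxpyRunner r{2.0, Kokkos::View<double*>("x", 100), Kokkos::View<double*>("y", 100)};
    r.run();
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}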

View File

@@ -793,7 +793,7 @@ struct ParallelReduceReturnValue<typename std::enable_if<
static return_type return_value(ReturnType& return_val,
const FunctorType& functor) {
#ifdef KOKOOS_ENABLE_DEPRECATED_CODE
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
return return_type(return_val,functor.value_count);
#else
if ( is_array<ReturnType>::value )
@@ -1002,7 +1002,8 @@ void parallel_reduce(const std::string& label,
typename Impl::enable_if<
Kokkos::Impl::is_execution_policy<PolicyType>::value
>::type * = 0) {
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value);
ReturnType return_value_impl = return_value;
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value_impl);
}
template< class PolicyType, class FunctorType, class ReturnType >
@@ -1054,6 +1055,9 @@ void parallel_reduce(const std::string& label,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1076,6 +1080,9 @@ void parallel_reduce(const PolicyType& policy,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1096,6 +1103,9 @@ void parallel_reduce(const size_t& policy,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1117,6 +1127,9 @@ void parallel_reduce(const std::string& label,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
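
Two behaviors change in the parallel_reduce hunks above: the labeled overload now copies its return-value argument into a non-const local before instantiating ParallelReduceAdaptor, and the overloads that take no result argument now static_assert that the functor provides a final() member. A hedged sketch of both calling styles follows; the functor, view names, and sizes are illustrative, not taken from this diff.

#include <Kokkos_Core.hpp>
#include <cstdio>

// A reduction functor with init/join/final; the final() member is exactly what
// the new static_assert demands when parallel_reduce is called without a result.
struct MaxIntoView {
  Kokkos::View<const double*> v;
  Kokkos::View<double> out;   // device-resident slot that final() writes into
  using value_type = double;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, double& m) const { if (v(i) > m) m = v(i); }

  KOKKOS_INLINE_FUNCTION
  void init(double& m) const { m = -1.0e300; }

  KOKKOS_INLINE_FUNCTION
  void join(volatile double& dst, const volatile double& src) const {
    if (src > dst) dst = src;
  }

  KOKKOS_INLINE_FUNCTION
  void final(double& m) const { out() = m; }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1000;
    Kokkos::View<double*> v("v", n);
    Kokkos::deep_copy(v, 1.0);

    // Style 1: labeled call with an explicit scalar result; this is the
    // overload that now copies return_value before running the adaptor.
    double sum = 0.0;
    Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<>(0, n),
        KOKKOS_LAMBDA(const int i, double& lsum) { lsum += v(i); }, sum);

    // Style 2: no result argument; compiles only because the functor has final().
    Kokkos::View<double> out("max");
    Kokkos::parallel_reduce("max", n, MaxIntoView{v, out});
    Kokkos::fence();

    printf("sum = %g\n", sum);
  }
  Kokkos::finalize();
  return 0;
}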

View File

@@ -136,6 +136,55 @@ public:
}
}
KOKKOS_INLINE_FUNCTION
void* get_shmem_aligned (const ptrdiff_t size, const ptrdiff_t alignment, int level = -1) const {
if(level == -1)
level = m_default_level;
if(level == 0) {
char* previous = m_iter_L0;
const ptrdiff_t missalign = size_t(m_iter_L0)%alignment;
if(missalign) m_iter_L0 += alignment-missalign;
void* tmp = m_iter_L0 + m_offset * size;
if (m_end_L0 < (m_iter_L0 += size * m_multiplier)) {
m_iter_L0 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L0-m_iter_L0));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;
} else {
char* previous = m_iter_L1;
const ptrdiff_t missalign = size_t(m_iter_L1)%alignment;
if(missalign) m_iter_L1 += alignment-missalign;
void* tmp = m_iter_L1 + m_offset * size;
if (m_end_L1 < (m_iter_L1 += size * m_multiplier)) {
m_iter_L1 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L1-m_iter_L1));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;
}
}
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
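
The new get_shmem_aligned above pads the level-0 or level-1 scratch cursor up to the requested alignment, hands out the chunk, and rolls the cursor back (returning NULL) when the request does not fit. A hedged sketch of how it might be called from a team kernel follows; the team count, scratch-size request, and buffer use are illustrative, and it assumes the usual team_shmem() accessor on the team handle.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using member_type = Kokkos::TeamPolicy<>::member_type;
    const int nteams = 4;

    // Request enough level-0 scratch per team to cover the chunk plus any
    // padding the alignment request may introduce.
    Kokkos::parallel_for("aligned_scratch",
        Kokkos::TeamPolicy<>(nteams, Kokkos::AUTO)
            .set_scratch_size(0, Kokkos::PerTeam(1024)),
        KOKKOS_LAMBDA(const member_type& team) {
          // 64 doubles from level-0 scratch, aligned to 64 bytes; a null
          // pointer means the padded request did not fit.
          double* buf = static_cast<double*>(
              team.team_shmem().get_shmem_aligned(64 * sizeof(double), 64));
          if (buf != nullptr) {
            Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 64),
                [&](const int i) { buf[i] = static_cast<double>(i); });
          }
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}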

View File

@@ -262,7 +262,7 @@ public:
}
//----------------------------------------
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
static
int team_size_max( const FunctorType & ) { return 1 ; }
@@ -274,6 +274,16 @@ public:
template< class FunctorType >
static
int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
#endif
template<class FunctorType>
int team_size_max( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_max( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_recommended( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_recommended( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
//----------------------------------------
@@ -281,6 +291,16 @@ public:
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
inline static
int vector_length_max()
{ return 1024; } // Use arbitrary large number, is meant as a vectorizable length
inline static
int scratch_size_max(int level)
{ return (level==0?
1024*32:
20*1024*1024);
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_request
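
This TeamPolicyInternal keeps the functor-only team_size_max/team_size_recommended overloads behind KOKKOS_ENABLE_DEPRECATED_CODE and adds tag-based overloads plus vector_length_max() and scratch_size_max() queries. A hedged sketch of how the tag-based queries are meant to be used through the public TeamPolicy follows; it assumes those overloads are reachable from the public policy type, and the Work functor is illustrative.

#include <Kokkos_Core.hpp>
#include <cstdio>

struct Work {
  KOKKOS_INLINE_FUNCTION
  void operator()(const Kokkos::TeamPolicy<>::member_type& team) const {
    (void)team;   // per-team work would go here
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::TeamPolicy<> policy(10, Kokkos::AUTO);

    // Tag-based limits: query parallel_for and parallel_reduce separately
    // instead of using the deprecated functor-only overloads.
    const int max_for = policy.team_size_max(Work(), Kokkos::ParallelForTag());
    const int rec_red = policy.team_size_recommended(Work(), Kokkos::ParallelReduceTag());
    printf("team_size_max(for) = %d, team_size_recommended(reduce) = %d\n",
           max_for, rec_red);

    Kokkos::parallel_for("teams", policy, Work());
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}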

Some files were not shown because too many files have changed in this diff.