Compare commits
59 Commits
patch_9Nov
...
patch_15No
| Author | SHA1 | Date | |
|---|---|---|---|
| cf79751f4f | |||
| e4dee3de17 | |||
| 6e225d90fc | |||
| 1fc3b4618c | |||
| eae9d27f6d | |||
| db29ec7eee | |||
| 090778c42b | |||
| db935dba5e | |||
| e160376365 | |||
| d5f222464b | |||
| 4d9e2a014b | |||
| 8a4983e4bc | |||
| 82d6aa9add | |||
| 4231ab3d57 | |||
| 25914ea3f3 | |||
| 003bb28471 | |||
| a557644939 | |||
| 04520e627d | |||
| 952e52982e | |||
| a942d8b3ba | |||
| 7a22b8aa62 | |||
| 4c1fbc359a | |||
| 2c644c5f2e | |||
| b1186a971e | |||
| 2dbd575a4b | |||
| 4805e1df22 | |||
| 380f0e4971 | |||
| a026ce9669 | |||
| 7e779d16de | |||
| b776f0f29f | |||
| 443644025f | |||
| c4c90a96ec | |||
| 5cb2463204 | |||
| 5a4e44b75a | |||
| 0ca02b6f41 | |||
| 2b96dfd6cc | |||
| c22c6e4d34 | |||
| b2d67bcbb5 | |||
| b3f08b38a2 | |||
| 8e9d4f5bce | |||
| fe07ad279d | |||
| 5062c43aea | |||
| 90caf0019c | |||
| 3b7ebbb8df | |||
| d7a479d2f6 | |||
| 0c8ce199af | |||
| 4a6f088c0b | |||
| 56598fcd0b | |||
| 265c11dca9 | |||
| d6631266ce | |||
| fbd610b8a9 | |||
| 86d1304176 | |||
| f68d77c7af | |||
| 7a4f534676 | |||
| 729201ab93 | |||
| ab8215a669 | |||
| fe04147ee0 | |||
| 62b1159673 | |||
| adeb0c2b54 |
1
.github/CODEOWNERS
vendored
@ -29,6 +29,7 @@ src/USER-MEAMC/* @martok
|
||||
src/USER-MOFFF/* @hheenen
|
||||
src/USER-MOLFILE/* @akohlmey
|
||||
src/USER-NETCDF/* @pastewka
|
||||
src/USER-PLUMED/* @gtribello
|
||||
src/USER-PHONON/* @lingtikong
|
||||
src/USER-PTM/* @pmla
|
||||
src/USER-OMP/* @akohlmey
|
||||
|
||||
@ -304,7 +304,7 @@ pkg_depends(USER-SCAFACOS MPI)
|
||||
|
||||
find_package(OpenMP QUIET)
|
||||
option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND})
|
||||
if(BUILD_OMP OR PKG_USER-OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
|
||||
if(BUILD_OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
|
||||
find_package(OpenMP REQUIRED)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
@ -349,7 +349,7 @@ if(PKG_KSPACE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE)
|
||||
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE OR PKG_USER-PLUMED)
|
||||
find_package(LAPACK)
|
||||
find_package(BLAS)
|
||||
if(NOT LAPACK_FOUND OR NOT BLAS_FOUND)
|
||||
@ -531,6 +531,12 @@ endif()
|
||||
|
||||
if(PKG_USER-PLUMED)
|
||||
find_package(GSL REQUIRED)
|
||||
set(PLUMED_MODE "static" CACHE STRING "Linkage mode for Plumed2 library")
|
||||
set(PLUMED_MODE_VALUES static shared runtime)
|
||||
set_property(CACHE PLUMED_MODE PROPERTY STRINGS ${PLUMED_MODE_VALUES})
|
||||
validate_option(PLUMED_MODE PLUMED_MODE_VALUES)
|
||||
string(TOUPPER ${PLUMED_MODE} PLUMED_MODE)
|
||||
|
||||
option(DOWNLOAD_PLUMED "Download Plumed (instead of using the system's one)" OFF)
|
||||
if(DOWNLOAD_PLUMED)
|
||||
include(ExternalProject)
|
||||
@ -543,13 +549,29 @@ if(PKG_USER-PLUMED)
|
||||
ExternalProject_get_property(plumed_build INSTALL_DIR)
|
||||
set(PLUMED_INSTALL_DIR ${INSTALL_DIR})
|
||||
list(APPEND LAMMPS_DEPS plumed_build)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
|
||||
${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o ${GSL_LIBRARIES} ${CMAKE_DL_LIBS})
|
||||
if(PLUMED_MODE STREQUAL "STATIC")
|
||||
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
|
||||
"${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o" ${GSL_LIBRARIES} ${CMAKE_DL_LIBS} ${LAPACK_LIBRARIES})
|
||||
elseif(PLUMED_MODE STREQUAL "SHARED")
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumed.so ${CMAKE_DL_LIBS})
|
||||
elseif(PLUMED_MODE STREQUAL "RUNTIME")
|
||||
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_INSTALL_DIR}/lib/libplumedKernel.so)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumedWrapper.a -rdynamic ${CMAKE_DL_LIBS})
|
||||
endif()
|
||||
set(PLUMED_INCLUDE_DIRS "${PLUMED_INSTALL_DIR}/include")
|
||||
else()
|
||||
find_package(PkgConfig REQUIRED)
|
||||
pkg_check_modules(PLUMED plumed REQUIRED)
|
||||
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
|
||||
if(PLUMED_MODE STREQUAL "STATIC")
|
||||
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
|
||||
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
|
||||
elseif(PLUMED_MODE STREQUAL "SHARED")
|
||||
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.shared)
|
||||
elseif(PLUMED_MODE STREQUAL "RUNTIME")
|
||||
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel.so)
|
||||
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.runtime)
|
||||
endif()
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_LOAD})
|
||||
endif()
|
||||
include_directories(${PLUMED_INCLUDE_DIRS})
|
||||
|
||||
@ -1492,6 +1492,11 @@ target API.
|
||||
</dl>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>BIN2C</code> (CUDA only)</td>
|
||||
<td>Path to bin2c executable, will automatically pick up the first one in your $PATH.</td>
|
||||
<td>(automatic)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@ -1647,9 +1652,8 @@ requires `gzip` to be in your `PATH`
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>GZIP_EXECUTABLE</code></td>
|
||||
<td></td>
|
||||
<td>
|
||||
</td>
|
||||
<td>Path to gzip executable, will automatically pick up the first one in your $PATH.</td>
|
||||
<td>(automatic)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
@ -1679,9 +1683,8 @@ requires `ffmpeg` to be in your `PATH`
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>FFMPEG_EXECUTABLE</code></td>
|
||||
<td></td>
|
||||
<td>
|
||||
</td>
|
||||
<td>Path to ffmpeg executable, will automatically pick up the first one in your $PATH.</td>
|
||||
<td>(automatic)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
|
||||
|
||||
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
|
||||
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
|
||||
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
|
||||
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})
|
||||
|
||||
|
||||
@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
|
||||
|
||||
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
|
||||
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
|
||||
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
|
||||
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})
|
||||
|
||||
|
||||
@ -56,7 +56,8 @@ set(PKG_USER-MOFFF OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-MOLFILE OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-NETCDF OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-OMP OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-PHOFFOFF OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-PHONON OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-PLUMED OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-QMMM OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-QTB OFF CACHE BOOL "" FORCE)
|
||||
set(PKG_USER-QUIP OFF CACHE BOOL "" FORCE)
|
||||
|
||||
@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
|
||||
|
||||
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
|
||||
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
|
||||
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
|
||||
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})
|
||||
|
||||
|
||||
@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
|
||||
|
||||
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
|
||||
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
|
||||
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
|
||||
|
||||
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})
|
||||
|
||||
|
||||
184
doc/github-development-workflow.md
Normal file
@ -0,0 +1,184 @@
|
||||
# Outline of the GitHub Development Workflow
|
||||
|
||||
The purpose of this document is to provide a point of reference for the
|
||||
core LAMMPS developers and other LAMMPS contributors to understand the
|
||||
choices the LAMMPS developers have agreed on. Git and GitHub provide the
|
||||
tools, but do not set policies, so it is up to the developers to come to
|
||||
an agreement as to how to define and interpret policies. This document
|
||||
is likely to change as our experiences and needs change and we try to
|
||||
adapt accordingly. Last change 2018-11-15.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
* [GitHub Merge Management](#github-merge-management)
|
||||
* [Pull Requests](#pull-requests)
|
||||
* [Pull Request Assignments](#pull-request-assignments)
|
||||
* [Pull Request Reviews](#pull-request-reviews)
|
||||
* [Pull Request Discussions](#pull-request-discussions)
|
||||
* [Checklist for Pull Requests](#checklist-for-pull-requests)
|
||||
* [GitHub Issues](#github-issues)
|
||||
* [Milestones and Release Planning](#milestones-and-release-planning)
|
||||
|
||||
## GitHub Merge Management
|
||||
|
||||
In the interest of consistency, ONLY ONE of the core LAMMPS developers
|
||||
should be doing the merging itself. This is currently
|
||||
[@akohlmey](https://github.com/akohlmey) (Axel Kohlmeyer).
|
||||
If this assignment needs to be changed, it shall be done right after a
|
||||
stable release.
|
||||
|
||||
## Pull Requests
|
||||
|
||||
ALL changes to the LAMMPS code and documentation, however trivial, MUST
|
||||
be submitted as a pull request to GitHub. All changes to the "master"
|
||||
branch must be made exclusively through merging pull requests. The
|
||||
"unstable" and "stable" branches, respectively are only to be updated
|
||||
upon patch or stable releases with fast-forward merges based on the
|
||||
associated tags. Pull requests may also be submitted to (long-running)
|
||||
feature branches created by LAMMPS developers inside the LAMMPS project,
|
||||
if needed. Those are not subject to the merge and review restrictions
|
||||
discussed in this document, though, but get managed as needed on a
|
||||
case-by-case basis.
|
||||
|
||||
### Pull Request Assignments
|
||||
|
||||
Pull requests can be "chaperoned" by one of the LAMMPS core developers.
|
||||
This is indicated by who the pull request is assigned to. LAMMPS core
|
||||
developers can self-assign or they can decide to assign a pull request
|
||||
to a different LAMMPS developer. Being assigned to a pull request means,
|
||||
that this pull request may need some work and the assignee is tasked to
|
||||
determine whether this might be needed or not, and may either implement the
|
||||
required changes or ask the submitter of the pull request to implement
|
||||
them. Even though all LAMMPS developers may have write access to pull
|
||||
requests (if enabled by the submitter, which is the default), only the
|
||||
submitter or the assignee of a pull request may do so. During this
|
||||
period the "work_in_progress" label shall be applied to the pull
|
||||
request. The assignee gets to decide what happens to the pull request
|
||||
next, e.g. whether it should be assigned to a different developer for
|
||||
additional checks and changes, or is recommended to be merged. Removing
|
||||
the "work_in_progress" label and assigning the pull request to the
|
||||
developer tasked with merging signals that a pull request is ready to be
|
||||
merged.
|
||||
|
||||
### Pull Request Reviews
|
||||
|
||||
People can be assigned to review a pull request in two ways:
|
||||
|
||||
* They can be assigned manually to review a pull request
|
||||
by the submitter or a LAMMPS developer
|
||||
* They can be automatically assigned, because a developer matches
|
||||
a file pattern in the `.github/CODEOWNERS` file, which associates
|
||||
developers with the code they contributed and maintain.
|
||||
|
||||
Reviewers are requested to state their appraisal of the proposed changes
|
||||
and either approve or request changes. People may unassign themselves
|
||||
from review, if they feel not competent about the changes proposed. At
|
||||
least one review from a LAMMPS developer with write access is required
|
||||
before merging in addition to the automated compilation tests. The
|
||||
feature, that reviews from code owners are "hard" reviews (i.e. they
|
||||
must all be approved before merging is allowed), is currently disabled
|
||||
and it is in the discretion of the merge maintainer to assess when
|
||||
a sufficient degree of approval has been reached. Reviews may be
|
||||
(automatically) dismissed, when the reviewed code has been changed,
|
||||
and then approval is required a second time.
|
||||
|
||||
### Pull Request Discussions
|
||||
|
||||
All discussions about a pull request should be kept as much as possible
|
||||
on the pull request discussion page on GitHub, so that other developers
|
||||
can later review the entire discussion after the fact and understand the
|
||||
rationale behind choices made. Exceptions to this policy are technical
|
||||
discussions, that are centered on tools or policies themselves
|
||||
(git, github, c++) rather than on the content of the pull request.
|
||||
|
||||
### Checklist for Pull Requests
|
||||
|
||||
Here are some items to check:
|
||||
* source and text files should not have CR/LF line endings (use dos2unix to remove)
|
||||
* every new command or style should have documentation. The names of
|
||||
source files (c++ and manual) should follow the name of the style.
|
||||
(example: `src/fix_nve.cpp`, `src/fix_nve.h` for `fix nve` command,
|
||||
implementing the class `FixNVE`, documented in `doc/src/fix_nve.txt`)
|
||||
* all new style names should be lower case, there must be no dashes,
|
||||
blanks, or underscores separating words, only forward slashes.
|
||||
* new style docs should be added to the "overview" files in
|
||||
`doc/src/Commands_*.txt`, `doc/src/{fixes,computes,pairs,bonds,...}.txt`
|
||||
and `doc/src/lammps.book`
|
||||
* new files in packages should be added to `src/.gitignore`
|
||||
* removed or renamed files in packages should be added to `src/Purge.list`
|
||||
* C++ source files should use C++ style include files for accessing
|
||||
C-library APIs, e.g. `#include <cstdlib>` instead of `#include <stdlib.h>`.
|
||||
And they should use angular brackets instead of double quotes. Full list:
|
||||
* assert.h -> cassert
|
||||
* ctype.h -> cctype
|
||||
* errno.h -> cerrno
|
||||
* float.h -> cfloat
|
||||
* limits.h -> climits
|
||||
* math.h -> cmath
|
||||
* complex.h -> complex
|
||||
* setjmp.h -> csetjmp
|
||||
* signal.h -> csignal
|
||||
* stddef.h -> cstddef
|
||||
* stdint.h -> cstdint
|
||||
* stdio.h -> cstdio
|
||||
* stdlib.h -> cstdlib
|
||||
* string.h -> cstring
|
||||
* time.h -> ctime
|
||||
Do not replace (as they are C++-11): `inttypes.h` and `stdint.h`.
|
||||
* Code should follow the C++-98 standard. C++-11 is only accepted
|
||||
in individual special purpose packages
|
||||
* indentation is two spaces per level
|
||||
* there should be no tabs and no trailing whitespace
|
||||
* header files, especially of new styles, should not include any
|
||||
other headers, except the header with the base class or cstdio.
|
||||
Forward declarations should be used instead when possible.
|
||||
* iostreams should be avoided. LAMMPS uses stdio from the C-library.
|
||||
* use of STL in headers and class definitions should be avoided.
|
||||
* static class members should be avoided at all cost.
|
||||
* anything storing atom IDs should be using `tagint` and not `int`.
|
||||
This can be flagged by the compiler only for pointers and only when
|
||||
compiling LAMMPS with `-DLAMMPS_BIGBIG`.
|
||||
* when including both `lmptype.h` (and using defines or macros from it)
|
||||
and `mpi.h`, `lmptype.h` must be included first.
|
||||
|
||||
## GitHub Issues
|
||||
|
||||
The GitHub issue tracker is the location where the LAMMPS developers
|
||||
and other contributors or LAMMPS users can report issues or bugs with
|
||||
the LAMMPS code or request new features to be added. Feature requests
|
||||
are usually indicated by a `[Feature Request]` marker in the subject.
|
||||
Issues are assigned to a person, if this person is working on this
|
||||
feature or working to resolve an issue. Issues that have nobody working
|
||||
on them at the moment, have the label `volunteer needed` attached.
|
||||
|
||||
When an issue, say `#125` is resolved by a specific pull request,
|
||||
the comment for the pull request shall contain the text `closes #125`
|
||||
or `fixes #125`, so that the issue is automatically closed when
|
||||
the pull request is merged.
|
||||
|
||||
## Milestones and Release Planning
|
||||
|
||||
LAMMPS uses a continuous release development model with incremental
|
||||
changes, i.e. significant effort is made - including automated pre-merge
|
||||
testing - that the code in the branch "master" does not get broken.
|
||||
More extensive testing (including regression testing) is performed after
|
||||
code is merged to the "master" branch. There are patch releases of
|
||||
LAMMPS every 1-3 weeks at a point, when the LAMMPS developers feel, that
|
||||
a sufficient amount of changes have happened, and the post-merge testing
|
||||
has been successful. These patch releases are marked with a
|
||||
`patch_<version date>` tag and the "unstable" branch follows only these
|
||||
versions (and thus is always supposed to be of production quality,
|
||||
unlike "master", which may be temporarily broken, in the case of larger
|
||||
change sets or unexpected incompatibilities or side effects).
|
||||
|
||||
About 3-4 times each year, there are going to be "stable" releases
|
||||
of LAMMPS. These have seen additional, manual testing and review of
|
||||
results from testing with instrumented code and static code analysis.
|
||||
Also, the last 2-3 patch releases before a stable release are
|
||||
"release candidate" versions which only contain bugfixes and
|
||||
documentation updates. For release planning and the information of
|
||||
code contributors, issues and pull requests being actively worked on
|
||||
are assigned a "milestone", which corresponds to the next stable
|
||||
release or the stable release after that, with a tentative release
|
||||
date.
|
||||
|
||||
@ -137,9 +137,9 @@ simply loading the appropriate module before building LAMMPS.
|
||||
-D CMAKE_C_COMPILER=name # name of C compiler
|
||||
-D CMAKE_Fortran_COMPILER=name # name of Fortran compiler :pre
|
||||
|
||||
-D CMAKE_CXX_FlAGS=string # flags to use with C++ compiler
|
||||
-D CMAKE_C_FlAGS=string # flags to use with C compiler
|
||||
-D CMAKE_Fortran_FlAGS=string # flags to use with Fortran compiler :pre
|
||||
-D CMAKE_CXX_FLAGS=string # flags to use with C++ compiler
|
||||
-D CMAKE_C_FLAGS=string # flags to use with C compiler
|
||||
-D CMAKE_Fortran_FLAGS=string # flags to use with Fortran compiler :pre
|
||||
|
||||
By default CMake will use a compiler it finds and it will add
|
||||
optimization flags appropriate to that compiler and any "accelerator
|
||||
|
||||
@ -41,11 +41,11 @@ This is the list of packages that may require additional steps.
|
||||
"USER-ATC"_#user-atc,
|
||||
"USER-AWPMD"_#user-awpmd,
|
||||
"USER-COLVARS"_#user-colvars,
|
||||
"USER-PLUMED" _#user-plumed,
|
||||
"USER-H5MD"_#user-h5md,
|
||||
"USER-INTEL"_#user-intel,
|
||||
"USER-MOLFILE"_#user-molfile,
|
||||
"USER-NETCDF"_#user-netcdf,
|
||||
"USER-PLUMED"_#user-plumed,
|
||||
"USER-OMP"_#user-omp,
|
||||
"USER-QMMM"_#user-qmmm,
|
||||
"USER-QUIP"_#user-quip,
|
||||
@ -715,57 +715,98 @@ a corresponding Makefile.lammps.machine file.
|
||||
|
||||
USER-PLUMED package :h4,link(user-plumed)
|
||||
|
||||
Before building LAMMPS with this package, you must first build PLUMED.
|
||||
PLUMED can be built as part of the LAMMPS build or installed separately
|
||||
from LAMMPS using the generic "plumed installation instructions"_plumedinstall.
|
||||
:link(plumedinstall,http://plumed.github.io/doc-master/user-doc/html/_installation.html)
|
||||
|
||||
PLUMED can be linked into MD codes in three different modes: static,
|
||||
shared, and runtime. With the "static" mode, all required PLUMED code
|
||||
is linked statically into the MD code. The MD code is then fully
|
||||
independent from the PLUMED installation, but also you have to
|
||||
rebuild/relink the MD code to update the PLUMED code inside it. With
|
||||
"shared" linkage mode, the MD code is linked to a shared library
|
||||
containing the PLUMED code, preferably after it was installed in a
|
||||
globally accessible location. This way the same installed PLUMED code
|
||||
can be shared across multiple MD packages and can be updated, for as
|
||||
long as the shared PLUMED library is ABI-compatible. The third linkage
|
||||
mode is "runtime" which allows to switch the PLUMED kernel at runtime
|
||||
between different variants through setting the PLUMED_KERNEL environment
|
||||
variable, which has to point to the location of the libplumedKernel.so
|
||||
dynamic shared object, which is then loaded at runtime. This is
|
||||
particularly convenient for doing PLUMED development and comparing
|
||||
multiple PLUMED versions without having to recompile the hosting MD
|
||||
code. All three linkage modes are supported by LAMMPS on selected
|
||||
operating systems (e.g. Linux) and using either CMake or traditional
|
||||
make build. The "static" mode should be most portable, the "runtime"
|
||||
mode support in LAMMPS makes the most assumptions about operating
|
||||
system and compiler environment. If one mode does not work, try a
|
||||
different one, or switch to a different build system, or consider
|
||||
a global PLUMED installation or downloading it during building LAMMPS.
|
||||
|
||||
[CMake build]:
|
||||
|
||||
-D DOWNLOAD_PLUMED=value # download PLUMED for build, value = no (default) or yes
|
||||
-D PLUMED_MODE=value # Linkage mode for PLUMED, value = static (default), shared, or runtime :pre
|
||||
|
||||
If DOWNLOAD_PLUMED is set to "yes", the PLUMED library will be
|
||||
downloaded (the version of which is hardcoded to a vetted version of
|
||||
PLUMED, usually a recent stable release version) and built inside the
|
||||
CMake build directory. If DOWNLOAD_PLUMED is set to "no" (the default),
|
||||
CMake will try to detect an installed version of PLUMED and link to
|
||||
that. For this to work, the PLUMED library has to be installed into a
|
||||
location where the pkg-config tool can find it or the PKG_CONFIG_PATH
|
||||
environment variable has to be set up accordingly.
|
||||
|
||||
The PLUMED_MODE setting determines the linkage mode of the PLUMED
|
||||
library. Allowed values are "static" (default), "shared", or "runtime".
|
||||
For a discussion of PLUMED linkage modes, please see above. When
|
||||
enabling DOWNLOAD_PLUMED, the static linkage mode is recommended.
|
||||
|
||||
[Traditional make]:
|
||||
|
||||
Before building LAMMPS with this package, you must first build
|
||||
PLUMED. We recommending building PLUMED separately to LAMMPS using
|
||||
the instructions that can be found at http://plumed.github.io/doc-master/user-doc/html/_installation.html.
|
||||
Before compiling LAMMPS you can then install the fix plumed command
|
||||
and compile LAMMPS in the usual manner:
|
||||
Before installing the USER-PLUMED package, first the PLUMED library
|
||||
needs to be configured so that LAMMPS can find the right settings when
|
||||
compiling and linking the LAMMPS executable itself. You can either
|
||||
download and build PLUMED inside the LAMMPS plumed library folder or use
|
||||
a previously installed PLUMED library and point LAMMPS to its
|
||||
location. You also have to choose the linkage mode: "static" (default),
|
||||
"shared" or "runtime". For a discussion of PLUMED linkage modes, please
|
||||
see above.
|
||||
|
||||
Download/compilation/configuration of the plumed library can be done
|
||||
from the src folder through the following make args:
|
||||
|
||||
make lib-plumed # print help message
|
||||
make lib-plumed args="-b" # download and build PLUMED in lib/plumed/plumed2
|
||||
make lib-plumed args="-p $HOME/.local" # use existing PLUMED installation in $HOME/.local
|
||||
make lib-plumed args="-p /usr/local -m shared" # use existing PLUMED installation in
|
||||
# /usr/local and use shared linkage mode
|
||||
:pre
|
||||
|
||||
Note that 2 symbolic (soft) links, "includelink" and "liblink" are
|
||||
created in lib/plumed to point into the location of the PLUMED build to
|
||||
use and also a new file lib/plumed/Makefile.lammps is created with
|
||||
settings suitable for LAMMPS to compile and link PLUMED in the desired
|
||||
linkage mode. After this step is completed, you can install the
|
||||
USER-PLUMED package and compile LAMMPS in the usual manner:
|
||||
|
||||
make yes-user-plumed
|
||||
make machine :pre
|
||||
|
||||
Once this compilation completes you should be able to run LAMMPS in the usual
|
||||
way. When running LAMMPS with an input script that contains a fix
|
||||
plumed command LAMMPS will try to call the PLUMED runtime library. PLUMED
|
||||
must therefore be available in your path if LAMMPS is compiled in this way.
|
||||
Once this compilation completes you should be able to run LAMMPS in the
|
||||
usual way. For shared linkage mode, libplumed.so must be found by the
|
||||
LAMMPS executable, which on many operating systems means, you have to
|
||||
set the LD_LIBRARY_PATH environment variable accordingly.
|
||||
|
||||
On some machines it is not possible to call runtime libraries in the way described
|
||||
above. When compiling on these machines it is thus better to statically link
|
||||
PLUMED when compiling LAMMPS. To do this you must either download a PLUMED
|
||||
tarball from http://www.plumed.org/get-it or clone it using
|
||||
git clone https://github.com/plumed/plumed2.git. If you download the tarball
|
||||
unpack it in the /lib/plumed directory. Similarly if you clone
|
||||
it clone it to the /lib/plumed directory as if there is a version of PLUMED within
|
||||
this directory LAMMPS will always try to statically link the version of PLUMED
|
||||
that this directory contains instead of dynamically linking the library.
|
||||
Support for the different linkage modes in LAMMPS varies for different
|
||||
operating systems, using the static linkage is expected to be the most
|
||||
portable, and thus set to be the default.
|
||||
|
||||
Once you have downloaded PLUMED into /lib/plumed you must again build the code
|
||||
here by following the instructions that can be found at
|
||||
http://plumed.github.io/doc-master/user-doc/html/_installation.html.
|
||||
|
||||
You can statically link PLUMED manually and if you want to access the full
|
||||
range of PLUMED functionalities this is what you should do. If you only want the
|
||||
basic range of functionalities, however, (i.e. no user contributed modules) then
|
||||
you can download and compile PLUMED in one step from the lammps/src dir, using a
|
||||
command like like those below:
|
||||
|
||||
make lib-plumed # print help message
|
||||
make lib-plumed args="-b" # download and build the latest stable version of PLUMED
|
||||
|
||||
These commands will simply invoke the lib/plumed/Install.py script with
|
||||
args specified. Furthermore, once the script has completed you should
|
||||
have a compiled version of PLUMED. With this built you can install/un-install
|
||||
PLUMED and build LAMMPS in the usual manner:
|
||||
|
||||
make yes-user-plumed
|
||||
make machine :pre
|
||||
|
||||
make no-user-plumed
|
||||
make machine :pre
|
||||
If you want to change the linkage mode, you have to re-run "make
|
||||
lib-plumed" with the desired settings [and] do a reinstall of the
|
||||
USER-PLUMED package with "make yes-user-plumed" to update the required
|
||||
makefile settings with the changes in the lib/plumed folder.
|
||||
|
||||
:line
|
||||
|
||||
|
||||
@ -56,6 +56,7 @@ packages:
|
||||
"USER-INTEL"_Build_extras.html#user-intel,
|
||||
"USER-MOLFILE"_Build_extras.html#user-molfile,
|
||||
"USER-NETCDF"_Build_extras.html#user-netcdf,
|
||||
"USER-PLUMED"_Build_extras.html#user-plumed,
|
||||
"USER-OMP"_Build_extras.html#user-omp,
|
||||
"USER-QMMM"_Build_extras.html#user-qmmm,
|
||||
"USER-QUIP"_Build_extras.html#user-quip,
|
||||
|
||||
@ -59,6 +59,7 @@ An alphabetic list of all LAMMPS commands.
|
||||
"fix_modify"_fix_modify.html,
|
||||
"group"_group.html,
|
||||
"group2ndx"_group2ndx.html,
|
||||
"hyper"_hyper.html,
|
||||
"if"_if.html,
|
||||
"info"_info.html,
|
||||
"improper_coeff"_improper_coeff.html,
|
||||
|
||||
@ -78,6 +78,8 @@ OPT.
|
||||
"grem"_fix_grem.html,
|
||||
"halt"_fix_halt.html,
|
||||
"heat"_fix_heat.html,
|
||||
"hyper/global"_fix_hyper_global.html,
|
||||
"hyper/local"_fix_hyper_local.html,
|
||||
"imd"_fix_imd.html,
|
||||
"indent"_fix_indent.html,
|
||||
"ipi"_fix_ipi.html,
|
||||
@ -108,7 +110,7 @@ OPT.
|
||||
"nph/asphere (o)"_fix_nph_asphere.html,
|
||||
"nph/body"_fix_nph_body.html,
|
||||
"nph/eff"_fix_nh_eff.html,
|
||||
"nph/sphere (ko)"_fix_nph_sphere.html,
|
||||
"nph/sphere (o)"_fix_nph_sphere.html,
|
||||
"nphug (o)"_fix_nphug.html,
|
||||
"npt (iko)"_fix_nh.html,
|
||||
"npt/asphere (o)"_fix_npt_asphere.html,
|
||||
@ -128,7 +130,7 @@ OPT.
|
||||
"nve/line"_fix_nve_line.html,
|
||||
"nve/manifold/rattle"_fix_nve_manifold_rattle.html,
|
||||
"nve/noforce"_fix_nve_noforce.html,
|
||||
"nve/sphere (o)"_fix_nve_sphere.html,
|
||||
"nve/sphere (ko)"_fix_nve_sphere.html,
|
||||
"nve/spin"_fix_nve_spin.html,
|
||||
"nve/tri"_fix_nve_tri.html,
|
||||
"nvk"_fix_nvk.html,
|
||||
@ -147,6 +149,7 @@ OPT.
|
||||
"phonon"_fix_phonon.html,
|
||||
"pimd"_fix_pimd.html,
|
||||
"planeforce"_fix_planeforce.html,
|
||||
"plumed"_fix_plumed.html,
|
||||
"poems"_fix_poems.html,
|
||||
"pour"_fix_pour.html,
|
||||
"precession/spin"_fix_precession_spin.html,
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
<!-- HTML_ONLY -->
|
||||
<HEAD>
|
||||
<TITLE>LAMMPS Users Manual</TITLE>
|
||||
<META NAME="docnumber" CONTENT="9 Nov 2018 version">
|
||||
<META NAME="docnumber" CONTENT="15 Nov 2018 version">
|
||||
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
|
||||
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
|
||||
</HEAD>
|
||||
@ -21,7 +21,7 @@
|
||||
:line
|
||||
|
||||
LAMMPS Documentation :c,h1
|
||||
9 Nov 2018 version :c,h2
|
||||
15 Nov 2018 version :c,h2
|
||||
|
||||
"What is a LAMMPS version?"_Manual_version.html
|
||||
|
||||
|
||||
@ -89,6 +89,7 @@ as contained in the file name.
|
||||
"USER-NETCDF"_#PKG-USER-NETCDF,
|
||||
"USER-OMP"_#PKG-USER-OMP,
|
||||
"USER-PHONON"_#PKG-USER-PHONON,
|
||||
"USER-PLUMED"_#PKG-USER-PLUMED,
|
||||
"USER-PTM"_#PKG-USER-PTM,
|
||||
"USER-QMMM"_#PKG-USER-QMMM,
|
||||
"USER-QTB"_#PKG-USER-QTB,
|
||||
@ -1187,7 +1188,7 @@ the NAMD MD code, but with portability in mind. Axel Kohlmeyer
|
||||
[Install:]
|
||||
|
||||
This package has "specific installation
|
||||
instructions"_Build_extras.html#gpu on the "Build
|
||||
instructions"_Build_extras.html#user-colvars on the "Build
|
||||
extras"_Build_extras.html doc page.
|
||||
|
||||
[Supporting info:]
|
||||
@ -1201,18 +1202,20 @@ examples/USER/colvars :ul
|
||||
|
||||
:line
|
||||
|
||||
USER-PLUMED package :link(USER-PLUMED),h4
|
||||
USER-PLUMED package :link(PKG-USER-PLUMED),h4
|
||||
|
||||
[Contents:]
|
||||
|
||||
The fix plumed command allows you to use the plugin for molecular
|
||||
dynamics PLUMED to analyse and bias your LAMMPS trajectory on the fly.
|
||||
In practise PLUMED is called from within the lammps input script by using
|
||||
the "fix plumed _fix_plumed.html command.
|
||||
The fix plumed command allows you to use the PLUMED free energy plugin
|
||||
for molecular dynamics to analyse and bias your LAMMPS trajectory on
|
||||
the fly. The PLUMED library is called from within the LAMMPS input
|
||||
script by using the "fix plumed"_fix_plumed.html command.
|
||||
|
||||
[Authors:] The PLUMED library is written and maintained by
|
||||
Massimilliano Bonomi, Giovanni Bussi, Carlo Camiloni and
|
||||
Gareth Tribello.
|
||||
[Authors:] The "PLUMED library"_#PLUMED is written and maintained by
|
||||
Massimiliano Bonomi, Giovanni Bussi, Carlo Camilloni and Gareth
|
||||
Tribello.
|
||||
|
||||
:link(PLUMED,http://www.plumed.org)
|
||||
|
||||
[Install:]
|
||||
|
||||
@ -1224,7 +1227,7 @@ extras"_Build_extras.html doc page.
|
||||
|
||||
src/USER-PLUMED/README
|
||||
lib/plumed/README
|
||||
"fix plumed "_fix_plumed.html
|
||||
"fix plumed"_fix_plumed.html
|
||||
examples/USER/plumed :ul
|
||||
|
||||
:line
|
||||
|
||||
@ -62,17 +62,20 @@ Package, Description, Doc page, Example, Library
|
||||
"USER-NETCDF"_Packages_details.html#PKG-USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, n/a, ext
|
||||
"USER-OMP"_Packages_details.html#PKG-USER-OMP, OpenMP-enabled styles,"Speed omp"_Speed_omp.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
|
||||
"USER-PHONON"_Packages_details.html#PKG-USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, no
|
||||
"USER-PLUMED"_Packages_details.html#PKG-USER-PLUMED, "PLUMED"_#PLUMED free energy library,"fix plumed"_fix_plumed.html, USER/plumed, ext
|
||||
"USER-PTM"_Packages_details.html#PKG-USER-PTM, Polyhedral Template Matching,"compute ptm/atom"_compute_ptm_atom.html, n/a, no
|
||||
"USER-QMMM"_Packages_details.html#PKG-USER-QMMM, QM/MM coupling,"fix qmmm"_fix_qmmm.html, USER/qmmm, ext
|
||||
"USER-QTB"_Packages_details.html#PKG-USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, no
|
||||
"USER-QUIP"_Packages_details.html#PKG-USER-QUIP, QUIP/libatoms interface,"pair_style quip"_pair_quip.html, USER/quip, ext
|
||||
"USER-REAXC"_Packages_details.html#PKG-USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, no
|
||||
"USER-SCAFACOS"_Packages_details.html#PKG-USER-SCAFACOS, wrapper on ScaFaCoS solver,"kspace_style scafacos"_kspace_style.html, USER/scafacos, ext
|
||||
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal, USER/sdpd, no
|
||||
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal.html, USER/sdpd, no
|
||||
"USER-SMD"_Packages_details.html#PKG-USER-SMD, smoothed Mach dynamics,"SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, ext
|
||||
"USER-SMTBQ"_Packages_details.html#PKG-USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, no
|
||||
"USER-SPH"_Packages_details.html#PKG-USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, no
|
||||
"USER-TALLY"_Packages_details.html#PKG-USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, no
|
||||
"USER-UEF"_Packages_details.html#PKG-USER-UEF, extensional flow,"fix nvt/uef"_fix_nh_uef.html, USER/uef, no
|
||||
"USER-VTK"_Packages_details.html#PKG-USER-VTK, dump output via VTK, "dump vtk"_dump_vtk.html, n/a, ext :tb(ea=c,ca1=l)
|
||||
|
||||
:link(MOFplus,https://www.mofplus.org/content/show/MOF-FF)
|
||||
:link(PLUMED,http://www.plumed.org)
|
||||
|
||||
@ -44,6 +44,7 @@ Commands :h1
|
||||
fix_modify
|
||||
group
|
||||
group2ndx
|
||||
hyper
|
||||
if
|
||||
improper_coeff
|
||||
improper_style
|
||||
|
||||
@ -176,6 +176,7 @@ compute"_Commands_compute.html doc page are followed by one or more of
|
||||
(g,i,k,o,t) to indicate which accelerated styles exist.
|
||||
|
||||
"ackland/atom"_compute_ackland_atom.html -
|
||||
"adf"_compute_adf.html - angular distribution function
|
||||
"aggregate/atom"_compute_cluster_atom.html - aggregate ID for each atom
|
||||
"angle"_compute_angle.html -
|
||||
"angle/local"_compute_angle_local.html -
|
||||
|
||||
@ -117,5 +117,5 @@ package"_Build_package.html doc page for more info.
|
||||
:line
|
||||
|
||||
:link(Larsen)
|
||||
[(Larsen)] Larsen, Schmidt, Schiøtz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).
|
||||
[(Larsen)] Larsen, Schmidt, Schiotz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).
|
||||
|
||||
|
||||
@ -221,6 +221,8 @@ accelerated styles exist.
|
||||
"grem"_fix_grem.html -
|
||||
"halt"_fix_halt.html - terminate a dynamics run or minimization
|
||||
"heat"_fix_heat.html - add/subtract momentum-conserving heat
|
||||
"hyper/global"_fix_hyper_global.html - global hyperdynamics
|
||||
"hyper/local"_fix_hyper_local.html - local hyperdynamics
|
||||
"imd"_fix_imd.html -
|
||||
"indent"_fix_indent.html - impose force due to an indenter
|
||||
"ipi"_fix_ipi.html -
|
||||
@ -238,6 +240,7 @@ accelerated styles exist.
|
||||
"manifoldforce"_fix_manifoldforce.html -
|
||||
"meso"_fix_meso.html -
|
||||
"meso"_fix_meso_move.html - move mesoscopic SPH/SDPD particles in a prescribed fashion
|
||||
"meso/move"_fix_meso_move.html -
|
||||
"meso/stationary"_fix_meso_stationary.html -
|
||||
"momentum"_fix_momentum.html - zero the linear and/or angular momentum of a group of atoms
|
||||
"move"_fix_move.html - move atoms in a prescribed fashion
|
||||
@ -293,6 +296,7 @@ accelerated styles exist.
|
||||
"phonon"_fix_phonon.html -
|
||||
"pimd"_fix_pimd.html -
|
||||
"planeforce"_fix_planeforce.html - constrain atoms to move in a plane
|
||||
"plumed"_fix_plumed.html - wrapper on PLUMED free energy library
|
||||
"poems"_fix_poems.html - constrain clusters of atoms to move as coupled rigid bodies
|
||||
"pour"_fix_pour.html - pour new atoms/molecules into a granular simulation domain
|
||||
"precession/spin"_fix_precession_spin.html -
|
||||
|
||||
@ -41,7 +41,7 @@ react = mandatory argument indicating new reaction specification :l
|
||||
fraction = initiate reaction with this probability if otherwise eligible
|
||||
seed = random number seed (positive integer)
|
||||
{stabilize_steps} value = timesteps
|
||||
timesteps = number of timesteps to apply internally created nve/limit.html
|
||||
timesteps = number of timesteps to apply internally created nve/limit fix :pre
|
||||
{update_edges} value = {none} or {charges} :l
|
||||
none = do not update topology near the edges of reaction templates
|
||||
charges = update atomic charges of all atoms in reaction templates
|
||||
|
||||
@ -116,7 +116,8 @@ not a limitation of functionality.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
"fix smd"_fix_smd.html
|
||||
"fix smd"_fix_smd.html, "fix spring"_fix_spring.html,
|
||||
"fix plumed"_fix_plumed.html
|
||||
|
||||
[Default:]
|
||||
|
||||
@ -126,4 +127,4 @@ and tstat = NULL.
|
||||
:line
|
||||
|
||||
:link(Fiorin)
|
||||
[(Fiorin)] Fiorin , Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594
|
||||
[(Fiorin)] Fiorin, Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594
|
||||
|
||||
260
doc/src/fix_hyper_global.txt
Normal file
@ -0,0 +1,260 @@
|
||||
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
|
||||
|
||||
:link(lws,http://lammps.sandia.gov)
|
||||
:link(ld,Manual.html)
|
||||
:link(lc,Commands_all.html)
|
||||
|
||||
:line
|
||||
|
||||
fix hyper/global command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
fix ID group-ID hyper/global cutbond qfactor Vmax Tequil :pre
|
||||
|
||||
ID, group-ID are documented in "fix"_fix.html command
|
||||
hyper/global = style name of this fix command
|
||||
cutbond = max distance at which a pair of atoms is considered bonded (distance units)
|
||||
qfactor = max strain at which bias potential goes to 0.0 (unitless)
|
||||
Vmax = height of bias potential (energy units)
|
||||
Tequil = equilibration temperature (temperature units) :ul
|
||||
|
||||
[Examples:]
|
||||
|
||||
fix 1 all hyper/global 1.0 0.3 0.8 300.0 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
This fix is meant to be used with the "hyper"_hyper.html command to
|
||||
perform a bond-boost global hyperdynamics (GHD) simulation. The role
|
||||
of this fix is to select a single pair of atoms in the system at
|
||||
each timestep to add a global bias potential to, which will alter the
|
||||
dynamics of the system in a manner that effectively accelerates time.
|
||||
This is in contrast to the "fix hyper/local"_fix_hyper_local.html
|
||||
command, which can be used to perform a local hyperdynamics (LHD)
|
||||
simulation, by adding a local bias potential to multiple pairs of
|
||||
atoms at each timestep. GHD can time accelerate a small simulation
|
||||
with up to a few 100 atoms. For larger systems, LHD is needed to
|
||||
achieve good time acceleration.
|
||||
|
||||
For a system that undergoes rare transition events, where one or more
|
||||
atoms move over an energy barrier to a new potential energy basin, the
|
||||
effect of the bias potential is to induce more rapid transitions.
|
||||
This can lead to a dramatic speed-up in the rate at which events
|
||||
occur, without altering their relative frequencies, thus leading to
|
||||
an overall increase in the elapsed real time of the simulation as
|
||||
compared to running for the same number of timesteps with normal MD.
|
||||
See the "hyper"_hyper.html doc page for a more general discussion of
|
||||
hyperdynamics and citations that explain both GHD and LHD.
|
||||
|
||||
The equations and logic used by this fix and described here to perform
|
||||
GHD follow the description given in "(Voter2013)"_#Voter2013ghd. The
|
||||
bond-boost form of a bias potential for HD is due to Miron and
|
||||
Fichthorn as described in "(Miron)"_#Mironghd. In LAMMPS we use a
|
||||
simplified version of bond-boost GHD where a single bond in the system
|
||||
is biased at any one timestep.
|
||||
|
||||
Bonds are defined between each pair of I,J atoms whose R0ij distance
|
||||
is less than {cutbond}, when the system is in a quenched state
|
||||
(minimum) energy. Note that these are not "bonds" in a covalent
|
||||
sense. A bond is simply any pair of atoms that meet the distance
|
||||
criterion. {Cutbond} is an argument to this fix; it is discussed
|
||||
below. A bond is only formed if one or both of the I,J atoms are in
|
||||
the specified group.
|
||||
|
||||
The current strain of bond IJ (when running dynamics) is defined as
|
||||
|
||||
Eij = (Rij - R0ij) / R0ij :pre
|
||||
|
||||
where Rij is the current distance between atoms I,J, and R0ij is the
|
||||
equilibrium distance in the quenched state.
|
||||
|
||||
The bias energy Vij of any bond IJ is defined as
|
||||
|
||||
Vij = Vmax * (1 - (Eij/qfactor)^2) for abs(Eij) < qfactor
|
||||
= 0 otherwise :pre
|
||||
|
||||
where the prefactor {Vmax} and the cutoff {qfactor} are arguments to
|
||||
this fix; they are discussed below. This functional form is an
|
||||
inverse parabola centered at 0.0 with height Vmax and which goes to
|
||||
0.0 at +/- qfactor.
|
||||
|
||||
Let Emax = the maximum of abs(Eij) for all IJ bonds in the system on a
|
||||
given timestep. On that step, Vij is added as a bias potential to
|
||||
only the single bond with strain Emax, call it Vij(max). Note that
|
||||
Vij(max) will be 0.0 if Emax >= qfactor on that timestep. Also note
|
||||
that Vij(max) is added to the normal interatomic potential that is
|
||||
computed between all atoms in the system at every step.
|
||||
|
||||
The derivative of Vij(max) with respect to the position of each atom
|
||||
in the Emax bond gives a bias force Fij(max) acting on the bond as
|
||||
|
||||
Fij(max) = - dVij(max)/dEij = 2 Vmax Eij / qfactor^2 for abs(Eij) < qfactor
|
||||
= 0 otherwise :pre
|
||||
|
||||
which can be decomposed into an equal and opposite force acting on
|
||||
only the two I,J atoms in the Emax bond.
|
||||
|
||||
The time boost factor for the system is given each timestep I by
|
||||
|
||||
Bi = exp(beta * Vij(max)) :pre
|
||||
|
||||
where beta = 1/kTequil, and {Tequil} is the temperature of the system
|
||||
and an argument to this fix. Note that Bi >= 1 at every step.
|
||||
|
||||
NOTE: To run GHD, the input script must also use the "fix
|
||||
langevin"_fix_langevin.html command to thermostat the atoms at the
|
||||
same {Tequil} as specified by this fix, so that the system is running
|
||||
constant-temperature (NVT) dynamics. LAMMPS does not check that this
|
||||
is done.
|
||||
|
||||
The elapsed time t_hyper for a GHD simulation running for {N}
|
||||
timesteps is simply
|
||||
|
||||
t_hyper = Sum (i = 1 to N) Bi * dt :pre
|
||||
|
||||
where dt is the timestep size defined by the "timestep"_timestep.html
|
||||
command. The effective time acceleration due to GHD is thus t_hyper /
|
||||
N*dt, where N*dt is elapsed time for a normal MD run of N timesteps.
|
||||
|
||||
Note that in GHD, the boost factor varies from timestep to timestep.
|
||||
Likewise, which bond has Emax strain and thus which pair of atoms the
|
||||
bias potential is added to, will also vary from timestep to timestep.
|
||||
This is in contrast to local hyperdynamics (LHD) where the boost
|
||||
factor is an input parameter; see the "fix
|
||||
hyper/local"_fix_hyper_local.html doc page for details.
|
||||
|
||||
:line
|
||||
|
||||
Here is additional information on the input parameters for GHD.
|
||||
|
||||
The {cutbond} argument is the cutoff distance for defining bonds
|
||||
between pairs of nearby atoms. A pair of I,J atoms in their
|
||||
equilibrium, minimum-energy configuration, which are separated by a
|
||||
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
|
||||
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
|
||||
crystalline lattice is a typical choice for solids, so that bonds
|
||||
exist only between nearest neighbor pairs.
|
||||
|
||||
The {qfactor} argument is the limiting strain at which the bias
|
||||
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
|
||||
bond distance can be up to 30% larger or 30% smaller than the
|
||||
equilibrium (quenched) R0ij distance and the two atoms in the bond
|
||||
could still experience a non-zero bias force.
|
||||
|
||||
If {qfactor} is set too large, then transitions from one energy basin
|
||||
to another are affected because the bias potential is non-zero at the
|
||||
transition state (e.g. saddle point). If {qfactor} is set too small
|
||||
then little boost is achieved because the Eij strain of some bond in
|
||||
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
|
||||
{qfactor} is typically reasonable.
|
||||
|
||||
The {Vmax} argument is the prefactor on the bias potential. Ideally,
|
||||
it should be set to a value slightly less than the smallest barrier
|
||||
height for an event to occur. Otherwise the applied bias potential
|
||||
may be large enough (when added to the interatomic potential) to
|
||||
produce a local energy basin with a maximum in the center. This can
|
||||
produce artificial energy minima in the same basin that trap an atom.
|
||||
Or if {Vmax} is even larger, it may induce an atom(s) to rapidly
|
||||
transition to another energy basin. Both cases are "bad dynamics"
|
||||
which violate the assumptions of GHD that guarantee an accelerated
|
||||
time-accurate trajectory of the system.
|
||||
|
||||
Note that if {Vmax} is set too small, the GHD simulation will run
|
||||
correctly. There will just be fewer events because the hyper time
|
||||
(t_hyper equation above) will be shorter.
|
||||
|
||||
NOTE: If you have no physical intuition as to the smallest barrier
|
||||
height in your system, a reasonable strategy to determine the largest
|
||||
{Vmax} you can use for a GHD model, is to run a sequence of
|
||||
simulations with smaller and smaller {Vmax} values, until the event
|
||||
rate does not change.
|
||||
|
||||
The {Tequil} argument is the temperature at which the system is
|
||||
simulated; see the comment above about the "fix
|
||||
langevin"_fix_langevin.html thermostatting. It is also part of the
|
||||
beta term in the exponential factor that determines how much boost is
|
||||
achieved as a function of the bias potential.
|
||||
|
||||
In general, the lower the value of {Tequil} and the higher the value
|
||||
of {Vmax}, the more boost will be achievable by the GHD algorithm.
|
||||
|
||||
:line
|
||||
|
||||
[Restart, fix_modify, output, run start/stop, minimize info:]
|
||||
|
||||
No information about this fix is written to "binary restart
|
||||
files"_restart.html.
|
||||
|
||||
The "fix_modify"_fix_modify.html {energy} option is supported by this
|
||||
fix to add the energy of the bias potential to the system's
|
||||
potential energy as part of "thermodynamic output"_thermo_style.html.
|
||||
|
||||
This fix computes a global scalar and global vector of length 11, which
|
||||
can be accessed by various "output commands"_Howto_output.html. The
|
||||
scalar is the magnitude of the bias potential (energy units) applied on
|
||||
the current timestep. The vector stores the following quantities:
|
||||
|
||||
1 = boost factor on this step (unitless)
|
||||
2 = max strain Eij of any bond on this step (unitless)
|
||||
3 = ID of first atom in the max-strain bond
|
||||
4 = ID of second atom in the max-strain bond
|
||||
5 = average # of bonds/atom on this step :ul
|
||||
|
||||
6 = fraction of timesteps with bias = 0.0 during this run
|
||||
7 = max drift distance of any atom during this run (distance units)
|
||||
8 = max bond length during this run (distance units) :ul
|
||||
|
||||
9 = cumulative hyper time since fix was defined (time units)
|
||||
10 = cumulative count of event timesteps since fix was defined
|
||||
11 = cumulative count of atoms in events since fix was defined :ul
|
||||
|
||||
The first 5 quantities are for the current timestep. Quantities 6-8
|
||||
are for the current hyper run. Quantities 9-11 are cumulative across
|
||||
multiple runs (since the fix was defined in the input script).
|
||||
|
||||
For value 7, drift is the distance an atom moves between timesteps
|
||||
when the bond list is reset, i.e. between events. Atoms involved in
|
||||
an event will typically move the greatest distance since others are
|
||||
typically oscillating around their lattice site.
|
||||
|
||||
For value 10, events are checked for by the "hyper"_hyper.html command
|
||||
once every {Nevent} timesteps. This value is the count of those
|
||||
timesteps on which one (or more) events was detected. It is NOT the
|
||||
number of distinct events, since more than one event may occur in the
|
||||
same {Nevent} time window.
|
||||
|
||||
For value 11, each time the "hyper"_hyper.html command checks for an
|
||||
event, it invokes a compute to flag zero or more atoms as
|
||||
participating in one or more events. E.g. atoms that have displaced
|
||||
more than some distance from the previous quench state. Value 11 is
|
||||
the cumulative count of the number of atoms participating in any of
|
||||
the events that were found.
|
||||
|
||||
The scalar and vector values calculated by this fix are all
|
||||
"intensive".
|
||||
|
||||
No parameter of this fix can be used with the {start/stop} keywords of
|
||||
the "run"_run.html command. This fix is not invoked during "energy
|
||||
minimization"_minimize.html.
|
||||
|
||||
[Restrictions:]
|
||||
|
||||
This command can only be used if LAMMPS was built with the REPLICA
|
||||
package. See the "Build package"_Build_package.html doc page for more
|
||||
info.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
"hyper"_hyper.html, "fix hyper/local"_fix_hyper_local.html
|
||||
|
||||
[Default:] None
|
||||
|
||||
:line
|
||||
|
||||
:link(Voter2013ghd)
|
||||
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
|
||||
144110 (2013).
|
||||
|
||||
:link(Mironghd)
|
||||
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).
|
||||
404
doc/src/fix_hyper_local.txt
Normal file
@ -0,0 +1,404 @@
|
||||
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
|
||||
|
||||
:link(lws,http://lammps.sandia.gov)
|
||||
:link(ld,Manual.html)
|
||||
:link(lc,Commands_all.html)
|
||||
|
||||
:line
|
||||
|
||||
fix hyper/local command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
fix ID group-ID hyper/local cutbond qfactor Vmax Tequil Dcut alpha Btarget :pre
|
||||
|
||||
ID, group-ID are documented in "fix"_fix.html command :ulb,l
|
||||
hyper/local = style name of this fix command :l
|
||||
cutbond = max distance at which a pair of atoms is considered bonded (distance units) :l
|
||||
qfactor = max strain at which bias potential goes to 0.0 (unitless) :l
|
||||
Vmax = estimated height of bias potential (energy units) :l
|
||||
Tequil = equilibration temperature (temperature units) :l
|
||||
Dcut = minimum distance between boosted bonds (distance units) :l
|
||||
alpha = boostostat relaxation time (time units) :l
|
||||
Btarget = desired time boost factor (unitless) :l
|
||||
zero or more keyword/value pairs may be appended :l
|
||||
keyword = {lostbond} or {check/bias} or {check/coeff}
|
||||
{lostbond} value = error/warn/ignore
|
||||
{check/bias} values = Nevery error/warn/ignore
|
||||
{check/coeff} values = Nevery error/warn/ignore :pre
|
||||
:ule
|
||||
|
||||
[Examples:]
|
||||
|
||||
fix 1 all hyper/local 1.0 0.3 0.8 300.0 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
This fix is meant to be used with the "hyper"_hyper.html command to
|
||||
perform a bond-boost local hyperdynamics (LHD) simulation. The role
|
||||
of this fix is to select multiple pairs of atoms in the system at
|
||||
each timestep to add a local bias potential to, which will alter the
|
||||
dynamics of the system in a manner that effectively accelerates time.
|
||||
This is in contrast to the "fix hyper/global"_fix_hyper_global.html
|
||||
command, which can be used to perform a global hyperdynamics (GHD)
|
||||
simulation, by adding a global bias potential to a single pair of
|
||||
atoms at each timestep. GHD can time accelerate a small simulation
|
||||
with up to a few 100 atoms. For larger systems, LHD is needed to
|
||||
achieve good time acceleration.
|
||||
|
||||
For a system that undergoes rare transition events, where one or more
|
||||
atoms move over an energy barrier to a new potential energy basin, the
|
||||
effect of the bias potential is to induce more rapid transitions.
|
||||
This can lead to a dramatic speed-up in the rate at which events
|
||||
occur, without altering their relative frequencies, thus leading to
|
||||
an overall increase in the elapsed real time of the simulation as
|
||||
compared to running for the same number of timesteps with normal MD.
|
||||
See the "hyper"_hyper.html doc page for a more general discussion of
|
||||
hyperdynamics and citations that explain both GHD and LHD.
|
||||
|
||||
The equations and logic used by this fix and described here to perform
|
||||
LHD follow the description given in "(Voter2013)"_#Voter2013lhd. The
|
||||
bond-boost form of a bias potential for HD is due to Miron and
|
||||
Fichthorn as described in "(Miron)"_#Mironlhd.
|
||||
|
||||
To understand this description, you should first read the description
|
||||
of the GHD algorithm on the "fix hyper/global"_fix_hyper_global.html
|
||||
doc page. This description of LHD builds on the GHD description.
|
||||
|
||||
The definition of bonds, Eij, and Emax are the same for GHD and LHD.
|
||||
The formulas for Vij(max) and Fij(max) are also the same except for a
|
||||
pre-factor Cij, explained below.
|
||||
|
||||
The bias energy Vij applied to a bond IJ with maximum strain is
|
||||
|
||||
Vij(max) = Cij * Vmax * (1 - (Eij/qfactor)^2) for abs(Eij) < qfactor
|
||||
= 0 otherwise :pre
|
||||
|
||||
The derivative of Vij(max) with respect to the position of each atom
|
||||
in the IJ bond gives a bias force Fij(max) acting on the bond as
|
||||
|
||||
Fij(max) = - dVij(max)/dEij = 2 Cij Vmax Eij / qfactor^2 for abs(Eij) < qfactor
|
||||
= 0 otherwise :pre
|
||||
|
||||
which can be decomposed into an equal and opposite force acting on
|
||||
only the two I,J atoms in the IJ bond.
|
||||
|
||||
The key difference is that in GHD a bias energy and force is added (on
|
||||
a particular timestep) to only one bond (pair of atoms) in the system,
|
||||
which is the bond with maximum strain Emax.
|
||||
|
||||
In LHD, a bias energy and force can be added to multiple bonds
|
||||
separated by the specified {Dcut} distance or more. A bond IJ is
|
||||
biased if it is the maximum strain bond within its local
|
||||
"neighborhood", which is defined as the bond IJ plus any neighbor
|
||||
bonds within a distance {Dcut} from IJ. The "distance" between bond
|
||||
IJ and bond KL is the minimum distance between any of the IK, IL, JK,
|
||||
JL pairs of atoms.
|
||||
|
||||
For a large system, multiple bonds will typically meet this
|
||||
requirement, and thus a bias potential Vij(max) will be applied to
|
||||
many bonds on the same timestep.
|
||||
|
||||
In LHD, all bonds store a Cij prefactor which appears in the Vij(max)
|
||||
and Fij(max) equations above. Note that the Cij factor scales the
|
||||
strength of the bias energy and forces whenever bond IJ is the maximum
|
||||
strain bond in its neighborhood.
|
||||
|
||||
Cij is initialized to 1.0 when a bond between the I,J atoms is first
|
||||
defined. The specified {Btarget} factor is then used to adjust the
|
||||
Cij prefactors for each bond every timestep in the following manner.
|
||||
|
||||
An instantaneous boost factor Bij is computed each timestep
|
||||
for each bond, as
|
||||
|
||||
Bij = exp(beta * Vkl(max)) :pre
|
||||
|
||||
where Vkl(max) is the bias energy of the maxstrain bond KL within bond
|
||||
IJ's neighborhood, beta = 1/kTequil, and {Tequil} is the temperature
|
||||
of the system and an argument to this fix.
|
||||
|
||||
NOTE: To run LHD, the input script must also use the "fix
|
||||
langevin"_fix_langevin.html command to thermostat the atoms at the
|
||||
same {Tequil} as specified by this fix, so that the system is running
|
||||
constant-temperature (NVT) dynamics. LAMMPS does not check that this
|
||||
is done.
|
||||
|
||||
Note that if IJ = KL, then bond IJ is a biased bond on that timestep,
|
||||
otherwise it is not. But regardless, the boost factor Bij can be
|
||||
thought of as an estimate of time boost currently being applied within a
|
||||
local region centered on bond IJ. For LHD, we want this to be the
|
||||
specified {Btarget} value everywhere in the simulation domain.
|
||||
|
||||
To accomplish this, if Bij < Btarget, the Cij prefactor for bond IJ is
|
||||
incremented on the current timestep by an amount proportional to the
|
||||
inverse of the specified {alpha} and the difference (Bij - Btarget).
|
||||
Conversely if Bij > Btarget, Cij is decremented by the same amount.
|
||||
This procedure is termed "boostostatting" in
|
||||
"(Voter2013)"_#Voter2013lhd. It drives all of the individual Cij to
|
||||
values such that when Vij(max) is applied as a bias to bond IJ, the
|
||||
resulting boost factor Bij will be close to {Btarget} on average.
|
||||
Thus the LHD time acceleration factor for the overall system is
|
||||
effectively {Btarget}.
|
||||
|
||||
Note that in LHD, the boost factor {Btarget} is specified by the user.
|
||||
This is in contrast to global hyperdynamics (GHD) where the boost
|
||||
factor varies each timestep and is computed as a function of {Vmax},
|
||||
Emax, and {Tequil}; see the "fix hyper/global"_fix_hyper_global.html
|
||||
doc page for details.
|
||||
|
||||
:line
|
||||
|
||||
Here is additional information on the input parameters for LHD.
|
||||
|
||||
Note that the {cutbond}, {qfactor}, and {Tequil} arguments have the
|
||||
same meaning as for GHD. The {Vmax} argument is slightly different.
|
||||
The {Dcut}, {alpha}, and {Btarget} parameters are unique to LHD.
|
||||
|
||||
The {cutbond} argument is the cutoff distance for defining bonds
|
||||
between pairs of nearby atoms. A pair of I,J atoms in their
|
||||
equilibrium, minimum-energy configuration, which are separated by a
|
||||
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
|
||||
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
|
||||
crystalline lattice is a typical choice for solids, so that bonds
|
||||
exist only between nearest neighbor pairs.
|
||||
|
||||
The {qfactor} argument is the limiting strain at which the bias
|
||||
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
|
||||
bond distance can be up to 30% larger or 30% smaller than the
|
||||
equilibrium (quenched) R0ij distance and the two atoms in the bond
|
||||
could still experience a non-zero bias force.
|
||||
|
||||
If {qfactor} is set too large, then transitions from one energy basin
|
||||
to another are affected because the bias potential is non-zero at the
|
||||
transition state (e.g. saddle point). If {qfactor} is set too small
|
||||
then little boost can be achieved because the Eij strain of some bond in
|
||||
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
|
||||
{qfactor} is typically a reasonable value.
|
||||
|
||||
The {Vmax} argument is a fixed prefactor on the bias potential. There
|
||||
is also a dynamic prefactor Cij, driven by the choice of {Btarget}
|
||||
as discussed above. The product of these should be a value less than
|
||||
the smallest barrier height for an event to occur. Otherwise the
|
||||
applied bias potential may be large enough (when added to the
|
||||
interatomic potential) to produce a local energy basin with a maximum
|
||||
in the center. This can produce artificial energy minima in the same
|
||||
basin that trap an atom. Or if Cij*{Vmax} is even larger, it may
|
||||
induce an atom(s) to rapidly transition to another energy basin. Both
|
||||
cases are "bad dynamics" which violate the assumptions of LHD that
|
||||
guarantee an accelerated time-accurate trajectory of the system.
|
||||
|
||||
NOTE: It may seem that {Vmax} can be set to any value, and Cij will
|
||||
compensate to reduce the overall prefactor if necessary. However the
|
||||
Cij are initialized to 1.0 and the boostostatting procedure typically
|
||||
operates slowly enough that there can be a time period of bad dynamics
|
||||
if {Vmax} is set too large. A better strategy is to set {Vmax} to the
|
||||
smallest barrier height for an event (the same as for GHD), so that
|
||||
the Cij remain near unity.
|
||||
|
||||
The {Tequil} argument is the temperature at which the system is
|
||||
simulated; see the comment above about the "fix
|
||||
langevin"_fix_langevin.html thermostatting. It is also part of the
|
||||
beta term in the exponential factor that determines how much boost is
|
||||
achieved as a function of the bias potential. See the discussion of
|
||||
the {Btarget} argument below.
|
||||
|
||||
As discussed above, the {Dcut} argument is the distance required
|
||||
between two locally maxstrain bonds for them to both be selected as
|
||||
biased bonds on the same timestep. Computationally, the larger {Dcut}
|
||||
is, the more work (computation and communication) must be done each
|
||||
timestep within the LHD algorithm. And the fewer bonds can be
|
||||
simultaneously biased, which may mean the specified {Btarget} time
|
||||
acceleration cannot be achieved.
|
||||
|
||||
Physically {Dcut} should be a long enough distance that biasing two
|
||||
pairs of atoms that close together will not influence the dynamics of
|
||||
each pair. E.g. something like 2x the cutoff of the interatomic
|
||||
potential. In practice a {Dcut} value of ~10 Angstroms seems to work
|
||||
well for many solid-state systems.
|
||||
|
||||
NOTE: You must also ensure that ghost atom communication is performed
|
||||
for a distance of at least {Dcut} + {cutevent} where {cutevent} = the
|
||||
distance one or more atoms move (between quenched states) to be
|
||||
considered an "event". It is an argument to the "compute
|
||||
event/displace" command used to detect events. By default the ghost
|
||||
communication distance is set by the pair_style cutoff, which will
|
||||
typically be < {Dcut}. The "comm_modify cutoff"_comm_modify.html
|
||||
command can be used to set the ghost cutoff explicitly, e.g.
|
||||
|
||||
comm_modify cutoff 12.0 :pre
|
||||
|
||||
This fix does not know the {cutevent} parameter, but uses half the
|
||||
bond length as an estimate to warn if the ghost cutoff is not long
|
||||
enough.
|
||||
|
||||
As described above the {alpha} argument is a pre-factor in the
|
||||
boostostat update equation for each bond's Cij prefactor. {Alpha} is
|
||||
specified in time units, similar to other thermostat or barostat
|
||||
damping parameters. It is roughly the physical time it will take the
|
||||
boostostat to adjust a Cij value from a too high (or too low) value to
|
||||
a correct one. An {alpha} setting of a few ps is typically good for
|
||||
solid-state systems. Note that the {alpha} argument here is the
|
||||
inverse of the alpha parameter discussed in
|
||||
"(Voter2013)"_#Voter2013lhd.
|
||||
|
||||
The {Btarget} argument is the desired time boost factor (a value > 1)
|
||||
that all the atoms in the system will experience. The elapsed time
|
||||
t_hyper for an LHD simulation running for {N} timesteps is simply
|
||||
|
||||
t_hyper = Btarget * N*dt :pre
|
||||
|
||||
where dt is the timestep size defined by the "timestep"_timestep.html
|
||||
command. The effective time acceleration due to LHD is thus t_hyper /
|
||||
N*dt = Btarget, where N*dt is elapsed time for a normal MD run
|
||||
of N timesteps.
|
||||
|
||||
You cannot choose an arbitrarily large setting for {Btarget}. The
|
||||
maximum value you should choose is
|
||||
|
||||
Btarget = exp(beta * Vsmall) :pre
|
||||
|
||||
where Vsmall is the smallest event barrier height in your system, beta
|
||||
= 1/kTequil, and {Tequil} is the specified temperature of the system
|
||||
(both by this fix and the Langevin thermostat).
|
||||
|
||||
Note that if {Btarget} is set smaller than this, the LHD simulation
|
||||
will run correctly. There will just be fewer events because the hyper
|
||||
time (t_hyper equation above) will be shorter.
|
||||
|
||||
NOTE: If you have no physical intuition as to the smallest barrier
|
||||
height in your system, a reasonable strategy to determine the largest
|
||||
{Btarget} you can use for an LHD model, is to run a sequence of
|
||||
simulations with smaller and smaller {Btarget} values, until the event
|
||||
rate does not change.
|
||||
|
||||
:line
|
||||
|
||||
[Restart, fix_modify, output, run start/stop, minimize info:]
|
||||
|
||||
No information about this fix is written to "binary restart
|
||||
files"_restart.html.
|
||||
|
||||
The "fix_modify"_fix_modify.html {energy} option is supported by this
|
||||
fix to add the energy of the bias potential to the system's
|
||||
potential energy as part of "thermodynamic output"_thermo_style.html.
|
||||
|
||||
This fix computes a global scalar and global vector of length 23,
|
||||
which can be accessed by various "output
|
||||
commands"_Howto_output.html. The scalar is the magnitude of
|
||||
the bias potential (energy units) applied on the current timestep,
|
||||
summed over all biased bonds. The vector stores the following
|
||||
quantities:
|
||||
|
||||
1 = # of biased bonds on this step
|
||||
2 = max strain Eij of any bond on this step (unitless)
|
||||
3 = average bias potential for all biased bonds on this step (energy units)
|
||||
4 = average # of bonds/atom on this step
|
||||
5 = average neighbor bonds/bond on this step within {Dcut} :ul
|
||||
|
||||
6 = fraction of steps and bonds with no bias during this run
|
||||
7 = max drift distance of any atom during this run (distance units)
|
||||
8 = max bond length during this run (distance units)
|
||||
9 = average # of biased bonds/step during this run
|
||||
10 = average bias potential for all biased bonds during this run (energy units)
|
||||
11 = max bias potential for any biased bond during this run (energy units)
|
||||
12 = min bias potential for any biased bond during this run (energy units)
|
||||
13 = max distance from my sub-box of any ghost atom with maxstrain < qfactor during this run (distance units)
|
||||
14 = max distance outside my box of any ghost atom with any maxstrain during this run (distance units)
|
||||
15 = count of ghost neighbor atoms not found on reneighbor steps during this run
|
||||
16 = count of lost bond partners during this run
|
||||
17 = average bias coeff for lost bond partners during this run
|
||||
18 = count of bias overlaps found during this run
|
||||
19 = count of non-matching bias coefficients found during this run :ul
|
||||
|
||||
20 = cumulative hyper time since fix created (time units)
|
||||
21 = cumulative count of event timesteps since fix created
|
||||
22 = cumulative count of atoms in events since fix created
|
||||
23 = cumulative # of new bonds since fix created :ul
|
||||
|
||||
The first quantities (1-5) are for the current timestep. Quantities
|
||||
6-19 are for the current hyper run. They are reset each time a new
|
||||
hyper run is performed. Quantities 20-23 are cumulative across
|
||||
multiple runs (since the fix was defined in the input script).
|
||||
|
||||
For value 6, the numerator is a count of all biased bonds on every
|
||||
timestep whose bias energy = 0.0 due to Eij >= {qfactor}. The
|
||||
denominator is the count of all biased bonds on all timesteps.
|
||||
|
||||
For value 7, drift is the distance an atom moves between timesteps
|
||||
when the bond list is reset, i.e. between events. Atoms involved in
|
||||
an event will typically move the greatest distance since others are
|
||||
typically oscillating around their lattice site.
|
||||
|
||||
For values 13 and 14, the maxstrain of a ghost atom is the maxstrain
|
||||
of any bond it is part of, and it is checked for ghost atoms within
|
||||
the bond neighbor cutoff.
|
||||
|
||||
Values 15-19 are mostly useful for debugging and diagnostic purposes.
|
||||
|
||||
For values 15-17, it is possible that a ghost atom owned by another
|
||||
processor will move far enough (e.g. as part of an event-in-progress)
|
||||
that it will no longer be within the communication cutoff distance for
|
||||
acquiring ghost atoms. Likewise it may be a ghost atom bond partner
|
||||
that cannot be found because it has moved too far. These values count
|
||||
those occurrences. Because they typically involve atoms that are part
|
||||
of events, they do not usually indicate bad dynamics. Value 17 is the
|
||||
average bias coefficient for bonds where a partner atom was lost.
|
||||
|
||||
For value 18, no two bonds should be biased if they are within a
|
||||
{Dcut} distance of each other. This value should be zero, indicating
|
||||
that no pair of bonds "overlap", meaning they are closer than {Dcut}
|
||||
from each other.
|
||||
|
||||
For value 19, the same bias coefficient is stored by both atoms in an
|
||||
IJ bond. This value should be zero, indicating that for all bonds,
|
||||
each atom in the bond stores a bias coefficient with the same
|
||||
value.
|
||||
|
||||
Value 20 is simply the specified {Btarget} boost factor times the number of
|
||||
timesteps times the timestep size.
|
||||
|
||||
For value 21, events are checked for by the "hyper"_hyper.html command
|
||||
once every {Nevent} timesteps. This value is the count of those
|
||||
timesteps on which one (or more) events was detected. It is NOT the
|
||||
number of distinct events, since more than one event may occur in the
|
||||
same {Nevent} time window.
|
||||
|
||||
For value 22, each time the "hyper"_hyper.html command checks for an
|
||||
event, it invokes a compute to flag zero or more atoms as
|
||||
participating in one or more events. E.g. atoms that have displaced
|
||||
more than some distance from the previous quench state. Value 22 is
|
||||
the cumulative count of the number of atoms participating in any of
|
||||
the events that were found.
|
||||
|
||||
Value 23 tallies the number of new bonds created by the bond reset
|
||||
operation. Bonds between a specific I,J pair of atoms may persist for
|
||||
the entire hyperdynamics simulation if neither I nor J is involved in
|
||||
an event.
|
||||
|
||||
The scalar and vector values calculated by this fix are all
|
||||
"intensive".
|
||||
|
||||
No parameter of this fix can be used with the {start/stop} keywords of
|
||||
the "run"_run.html command. This fix is not invoked during "energy
|
||||
minimization"_minimize.html.
|
||||
|
||||
[Restrictions:]
|
||||
|
||||
This fix is part of the REPLICA package. It is only enabled if LAMMPS
|
||||
was built with that package. See the "Build package"_Build_package.html
|
||||
doc page for more info.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
"hyper"_hyper.html, "fix hyper/global"_fix_hyper_global.html
|
||||
|
||||
[Default:] None
|
||||
|
||||
:line
|
||||
|
||||
:link(Voter2013lhd)
|
||||
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
|
||||
144110 (2013).
|
||||
|
||||
:link(Mironlhd)
|
||||
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).
|
||||
@ -25,33 +25,32 @@ fix pl all plumed all plumed plumedfile plumed.dat outfile p.log
|
||||
|
||||
[Description:]
|
||||
|
||||
This fix instructs LAMMPS to call the PLUMED library, which allows one
|
||||
to perform various forms of trajectory analysis on the fly and to also
|
||||
use methods such as umbrella sampling and metadynamics to enhance the
|
||||
sampling of phase space.
|
||||
This fix instructs LAMMPS to call the "PLUMED"_plumedhome library, which
|
||||
allows one to perform various forms of trajectory analysis on the fly
|
||||
and to also use methods such as umbrella sampling and metadynamics to
|
||||
enhance the sampling of phase space.
|
||||
|
||||
The documentation included here only describes the fix plumed command.
|
||||
This command is LAMMPS specific whereas most of the functionality
|
||||
implemented in PLUMED will work with a range of MD codes and also when
|
||||
PLUMED is used as a stand alone code. The full documentation for PLUMED
|
||||
is available at "this website"_http://www.plumed.org/documentation
|
||||
The documentation included here only describes the fix plumed command
|
||||
itself. This command is LAMMPS specific, whereas most of the
|
||||
functionality implemented in PLUMED will work with a range of
|
||||
MD codes, and when PLUMED is used as a stand alone code for analysis.
|
||||
The full "documentation for PLUMED"_plumeddocs is available online and
|
||||
included in the PLUMED source code. The PLUMED library development is
|
||||
hosted at
|
||||
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2
|
||||
A detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
|
||||
|
||||
The PLUMED library is developed at
|
||||
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2 A
|
||||
detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
|
||||
|
||||
There are some example scripts for using this package with LAMMPS in the
|
||||
There is an example input for using this package with LAMMPS in the
|
||||
examples/USER/plumed directory.
|
||||
|
||||
:line
|
||||
|
||||
The command to call PLUMED above is reasonably self explanatory. Within
|
||||
the input file for lammps the user is required to specify the input file
|
||||
for PLUMED and a file on which to output the PLUMED log. The user must
|
||||
specify both of these arguments every time PLUMED is to be used.
|
||||
Furthermore, the fix plumed command should appear in the LAMMPS input
|
||||
file after the relevant input paramters (e.g. the timestep) have been
|
||||
set.
|
||||
The command to make LAMMPS call PLUMED during a run requires two keyword
|
||||
value pairs pointing to the PLUMED input file and an output file for the
|
||||
PLUMED log. The user must specify these arguments every time PLUMED is
|
||||
to be used. Furthermore, the fix plumed command should appear in the
|
||||
LAMMPS input file [after] relevant input parameters (e.g. the timestep)
|
||||
have been set.
|
||||
|
||||
The {group-ID} entry is ignored. LAMMPS will always pass all the atoms
|
||||
to PLUMED and there can only be one instance of the plumed fix at a
|
||||
@ -64,10 +63,10 @@ functionality by only allowing only one plumed fix in the LAMMPS input.
|
||||
The {plumedfile} keyword allows the user to specify the name of the
|
||||
PLUMED input file. Instructions as to what should be included in a
|
||||
plumed input file can be found in the "documentation for
|
||||
PLUMED"_http://www.plumed.org/documentation.
|
||||
PLUMED"_plumeddocs.
|
||||
|
||||
The {outfile} keyword allows the user to specify the name of a file on
|
||||
which to output the PLUMED log. This log file normally just parots the
|
||||
which to output the PLUMED log. This log file normally just parrots the
|
||||
information that is contained in the input file. The names of the files
|
||||
on which the results from the various analyses that have been performed
|
||||
using PLUMED will be specified by the user in the PLUMED input file.
|
||||
@ -76,12 +75,13 @@ using PLUMED will be specified by the user in the PLUMED input file.
|
||||
|
||||
When performing a restart of a calculation that involves PLUMED you must
|
||||
include a RESTART command in the PLUMED input file as detailed in the
|
||||
"PLUMED documentation"_http://www.plumed.org/documentation. When the
|
||||
restart command is found in the PLUMED input PLUMED will append to the
|
||||
files that were generated in the run that was performed previously.
|
||||
Furthermore, any history dependent bias potentials that were accumulated
|
||||
in previous calculations will be read in when the restart command is
|
||||
included in the PLUMED input.
|
||||
"PLUMED documentation"_plumeddocs. When the restart command is found in
|
||||
the PLUMED input PLUMED will append to the files that were generated in
|
||||
the run that was performed previously. No part of the PLUMED restart
|
||||
data is included in the LAMMPS restart files. Furthermore, any history
|
||||
dependent bias potentials that were accumulated in previous calculations
|
||||
will be read in when the RESTART command is included in the PLUMED
|
||||
input.
|
||||
|
||||
The "fix_modify"_fix_modify.html {energy} option is not supported by
|
||||
this fix.
|
||||
@ -97,10 +97,7 @@ This fix is part of the USER-PLUMED package. It is only enabled if
|
||||
LAMMPS was built with that package. See the "Build
|
||||
package"_Build_package.html doc page for more info.
|
||||
|
||||
There can only be one plumed fix active at a time. Since the interface
|
||||
communicates only the minimum amount of information and since the PLUMED
|
||||
module itself can handle an arbitrary number of analysis and biasing
|
||||
methods, this is not a limitation of functionality.
|
||||
There can only be one plumed fix active at a time.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
@ -115,3 +112,6 @@ The default options are plumedfile = NULL and outfile = NULL
|
||||
|
||||
:link(PLUMED)
|
||||
[(PLUMED)] G.A. Tribello, M. Bonomi, D. Branduardi, C. Camilloni and G. Bussi, Comp. Phys. Comm 185, 604 (2014)
|
||||
|
||||
:link(plumeddocs,http://www.plumed.org/documentation)
|
||||
:link(plumedhome,http://www.plumed.org/)
|
||||
|
||||
@ -137,7 +137,8 @@ package"_Build_package.html doc page for more info.
|
||||
|
||||
"fix drag"_fix_drag.html, "fix spring"_fix_spring.html,
|
||||
"fix spring/self"_fix_spring_self.html,
|
||||
"fix spring/rg"_fix_spring_rg.html
|
||||
"fix spring/rg"_fix_spring_rg.html,
|
||||
"fix colvars"_fix_colvars.html, "fix plumed"_fix_plumed.html
|
||||
|
||||
[Default:] none
|
||||
|
||||
|
||||
@ -57,6 +57,8 @@ Fixes :h1
|
||||
fix_grem
|
||||
fix_halt
|
||||
fix_heat
|
||||
fix_hyper_global
|
||||
fix_hyper_local
|
||||
fix_imd
|
||||
fix_indent
|
||||
fix_ipi
|
||||
|
||||
192
doc/src/hyper.txt
Normal file
@ -0,0 +1,192 @@
|
||||
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
|
||||
|
||||
:link(lws,http://lammps.sandia.gov)
|
||||
:link(ld,Manual.html)
|
||||
:link(lc,Section_commands.html#comm)
|
||||
|
||||
:line
|
||||
|
||||
hyper command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
hyper N Nevent fix-ID compute-ID keyword values ... :pre
|
||||
|
||||
N = # of timesteps to run :ulb,l
|
||||
Nevent = check for events every this many steps :l
|
||||
fix-ID = ID of a fix that applies a global or local bias potential, can be NULL :l
|
||||
compute-ID = ID of a compute that identifies when an event has occurred :l
|
||||
zero or more keyword/value pairs may be appended :l
|
||||
keyword = {min} or {dump} or {rebond} :l
|
||||
{min} values = etol ftol maxiter maxeval
|
||||
etol = stopping tolerance for energy, used in quenching
|
||||
ftol = stopping tolerance for force, used in quenching
|
||||
maxiter = max iterations of minimize, used in quenching
|
||||
maxeval = max number of force/energy evaluations, used in quenching
|
||||
{dump} value = dump-ID
|
||||
dump-ID = ID of dump to trigger whenever an event takes place
|
||||
{rebond} value = Nrebond
|
||||
Nrebond = frequency at which to reset bonds, even if no event has occurred
|
||||
:pre
|
||||
:ule
|
||||
|
||||
[Examples:]
|
||||
|
||||
compute event all event/displace 1.0
|
||||
fix HG mobile hyper/global 3.0 0.3 0.4 800.0
|
||||
hyper 5000 100 HG event min 1.0e-6 1.0e-6 100 100 dump 1 dump 5 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
Run a bond-boost hyperdynamics (HD) simulation where time is
|
||||
accelerated by application of a bias potential to one or more pairs of
|
||||
nearby atoms in the system. This command can be used to run both
|
||||
global and local hyperdynamics. In global HD a single bond within the
|
||||
system is biased on each timestep. In local HD multiple bonds
|
||||
(separated by a sufficient distance) can be biased simultaneously at
|
||||
each timestep. In the bond-boost hyperdynamics context, a "bond" is
|
||||
not a covalent bond between a pair of atoms in a molecule. Rather it
|
||||
is simply a pair of nearby atoms as discussed below.
|
||||
|
||||
Both global and local HD are described in "(Voter2013)"_#Voter2013 by
|
||||
Art Voter and collaborators. Similar to parallel replica dynamics
|
||||
(PRD), global and local HD are methods for performing accelerated
|
||||
dynamics that are suitable for infrequent-event systems that obey
|
||||
first-order kinetics. A good overview of accelerated dynamics methods
|
||||
for such systems is given in "(Voter2002)"_#Voter2002hd from the same
|
||||
group. To quote from the review paper: "The dynamical evolution is
|
||||
characterized by vibrational excursions within a potential basin,
|
||||
punctuated by occasional transitions between basins." The transition
|
||||
probability is characterized by p(t) = k*exp(-kt) where k is the rate
|
||||
constant. Running multiple replicas gives an effective enhancement in
|
||||
the timescale spanned by the multiple simulations, while waiting for
|
||||
an event to occur.
|
||||
|
||||
Both HD and PRD produce a time-accurate trajectory that effectively
|
||||
extends the timescale over which a system can be simulated, but they
|
||||
do it differently. HD uses a single replica of the system and
|
||||
accelerates time by biasing the interaction potential in a manner such
|
||||
that each timestep is effectively longer. PRD creates Nr replicas of
|
||||
the system and runs dynamics on each independently with a normal
|
||||
unbiased potential until an event occurs in one of the replicas. The
|
||||
time between events is reduced by a factor of Nr replicas. For both
|
||||
methods, per CPU second, more physical time elapses and more events
|
||||
occur. See the "prd"_prd.html doc page for more info about PRD.
|
||||
|
||||
An HD run has several stages, which are repeated each time an event
|
||||
occurs, as explained below. The logic for an HD run is as follows:
|
||||
|
||||
quench
|
||||
create initial list of bonds :pre
|
||||
|
||||
while (time remains):
|
||||
run dynamics for Nevent steps
|
||||
quench
|
||||
check for an event
|
||||
if event occurred: reset list of bonds
|
||||
restore pre-quench state :pre
|
||||
|
||||
The list of bonds is the list of pairs of atoms that are within a
|
||||
short cutoff distance of each other after the system energy is
|
||||
minimized (quenched). This list is created and reset by a "fix
|
||||
hyper/global"_fix_hyper_global.html or "fix
|
||||
hyper/local"_fix_hyper_local.html command specified as {fix-ID}. At
|
||||
every dynamics timestep, the same fix selects one or more bonds to
|
||||
apply a bias potential to.
|
||||
|
||||
IMPORTANT NOTE: The style of fix associated with the specified
|
||||
{fix-ID} determines whether you are running the global versus local
|
||||
hyperdynamics algorithm.
|
||||
|
||||
Dynamics (with the bias potential) is run continuously, stopping every
|
||||
{Nevent} steps to check if a transition event has occurred. The
|
||||
specified {N} for total steps must be a multiple of {Nevent}. The check
|
||||
is performed by quenching the system and comparing the resulting atom
|
||||
coordinates to the coordinates from the previous basin.
|
||||
|
||||
A quench is an energy minimization and is performed by whichever
|
||||
algorithm has been defined by the "min_style"_min_style.html command.
|
||||
Minimization parameters may be set via the
|
||||
"min_modify"_min_modify.html command and by the {min} keyword of the
|
||||
hyper command. The latter are the settings that would be used with
|
||||
the "minimize"_minimize.html command. Note that typically, you do not
|
||||
need to perform a highly-converged minimization to detect a transition
|
||||
event, though you may need to in order to prevent a set of atoms in
|
||||
the system from relaxing to a saddle point.
|
||||
|
||||
The event check is performed by a compute with the specified
|
||||
{compute-ID}. Currently there is only one compute that works with the
|
||||
hyper command, which is the "compute
|
||||
event/displace"_compute_event_displace.html command. Other
|
||||
event-checking computes may be added. "Compute
|
||||
event/displace"_compute_event_displace.html checks whether any atom in
|
||||
the compute group has moved further than a specified threshold
|
||||
distance. If so, an event has occurred.
|
||||
|
||||
If this happens, the list of bonds is reset, since some bond pairs
|
||||
are likely now too far apart, and new pairs are likely close enough
|
||||
to be considered a bond. The pre-quenched state of the
|
||||
system (coordinates and velocities) is restored, and dynamics continue.
|
||||
|
||||
At the end of the hyper run, a variety of statistics are output to the
|
||||
screen and logfile. These include info relevant to both global and
|
||||
local hyperdynamics, such as the number of events and the elapsed
|
||||
hyper time (accelerated time). It also includes info specific to one or
|
||||
the other, depending on which style of fix was specified by {fix-ID}.
|
||||
|
||||
:line
|
||||
|
||||
The optional keywords operate as follows.
|
||||
|
||||
As explained above, the {min} keyword can be used to specify
|
||||
parameters for the quench. Their meaning is the same
|
||||
as for the "minimize"_minimize.html command.
|
||||
|
||||
The {dump} keyword can be used to trigger a specific dump command with
|
||||
the specified {dump-ID} to output a snapshot each time an event is
|
||||
detected. It can be specified multiple times with different {dump-ID}
|
||||
values, as in the example above. These snapshots will be for the
|
||||
quenched state of the system on a timestep that is a multiple of
|
||||
{Nevent}, i.e. a timestep after the event has occurred. Note that any
|
||||
dump command in the input script will also output snapshots at
|
||||
whatever timestep interval it defines via its {N} argument; see the
|
||||
"dump"_dump.html command for details. This means if you only want a
|
||||
particular dump to output snapshots when events are detected, you
|
||||
should specify its {N} as a value larger than the length of the
|
||||
hyperdynamics run.
|
||||
|
||||
As in the code logic above, the bond list is normally only reset when
|
||||
an event occurs. The {rebond} keyword will force a reset of the bond
|
||||
list every {Nrebond} steps, even if an event has not occurred.
|
||||
{Nrebond} must be a multiple of {Nevent}. This can be useful to check
|
||||
if more frequent resets alter event statistics, perhaps because the
|
||||
parameters chosen for defining what is a bond and what is an event are
|
||||
producing bad dynamics in the presence of the bias potential.
|
||||
|
||||
:line
|
||||
|
||||
[Restrictions:]
|
||||
|
||||
This command can only be used if LAMMPS was built with the REPLICA
|
||||
package. See the "Build package"_Build_package.html doc
|
||||
page for more info.
|
||||
|
||||
[Related commands:]
|
||||
|
||||
"fix hyper/global"_fix_hyper_global.html, "fix
|
||||
hyper/local"_fix_hyper_local.html, "compute
|
||||
event/displace"_compute_event_displace.html, "prd"_prd.html
|
||||
|
||||
[Default:]
|
||||
|
||||
The option defaults are min = 0.1 0.1 40 50.
|
||||
|
||||
:line
|
||||
|
||||
:link(Voter2013)
|
||||
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
|
||||
144110 (2013).
|
||||
|
||||
:link(Voter2002hd)
|
||||
[(Voter2002)] Voter, Montalenti, Germann, Annual Review of Materials
|
||||
Research 32, 321 (2002).
|
||||
@ -160,6 +160,7 @@ dump_cfg_uef.html
|
||||
echo.html
|
||||
group.html
|
||||
group2ndx.html
|
||||
hyper.html
|
||||
if.html
|
||||
include.html
|
||||
info.html
|
||||
@ -277,6 +278,8 @@ fix_gravity.html
|
||||
fix_grem.html
|
||||
fix_halt.html
|
||||
fix_heat.html
|
||||
fix_hyper_global.html
|
||||
fix_hyper_local.html
|
||||
fix_imd.html
|
||||
fix_indent.html
|
||||
fix_ipi.html
|
||||
|
||||
@ -48,11 +48,12 @@ replicas of a system. One or more replicas can be used. The total
|
||||
number of steps {N} to run can be interpreted in one of two ways; see
|
||||
discussion of the {time} keyword below.
|
||||
|
||||
PRD is described in "this paper"_#Voter1998 by Art Voter. It is a method
|
||||
for performing accelerated dynamics that is suitable for
|
||||
infrequent-event systems that obey first-order kinetics. A good
|
||||
overview of accelerated dynamics methods for such systems in given in
|
||||
"this review paper"_#Voter2002prd from the same group. To quote from the
|
||||
PRD is described in "(Voter1998)"_#Voter1998 by Art Voter. Similar to
|
||||
global or local hyperdynamics (HD), PRD is a method for performing
|
||||
accelerated dynamics that is suitable for infrequent-event systems
|
||||
that obey first-order kinetics. A good overview of accelerated
|
||||
dynamics methods for such systems is given in this review paper
|
||||
"(Voter2002)"_#Voter2002prd from Art's group. To quote from the
|
||||
paper: "The dynamical evolution is characterized by vibrational
|
||||
excursions within a potential basin, punctuated by occasional
|
||||
transitions between basins." The transition probability is
|
||||
@ -61,15 +62,26 @@ Running multiple replicas gives an effective enhancement in the
|
||||
timescale spanned by the multiple simulations, while waiting for an
|
||||
event to occur.
|
||||
|
||||
Each replica runs on a partition of one or more processors. Processor
|
||||
partitions are defined at run-time using the "-partition command-line
|
||||
switch"_Run_options.html. Note that if you have MPI installed, you
|
||||
can run a multi-replica simulation with more replicas (partitions)
|
||||
than you have physical processors, e.g you can run a 10-replica
|
||||
simulation on one or two processors. However for PRD, this makes
|
||||
little sense, since running a replica on virtual instead of physical
|
||||
processors,offers no effective parallel speed-up in searching for
|
||||
infrequent events. See the "Howto replica"_Howto_replica.html doc
|
||||
Both PRD and HD produce a time-accurate trajectory that effectively
|
||||
extends the timescale over which a system can be simulated, but they
|
||||
do it differently. PRD creates Nr replicas of the system and runs
|
||||
dynamics on each independently with a normal unbiased potential until
|
||||
an event occurs in one of the replicas. The time between events is
|
||||
reduced by a factor of Nr replicas. HD uses a single replica of the
|
||||
system and accelerates time by biasing the interaction potential in a
|
||||
manner such that each timestep is effectively longer. For both
|
||||
methods, per CPU second, more physical time elapses and more events
|
||||
occur. See the "hyper"_hyper.html doc page for more info about HD.
|
||||
|
||||
In PRD, each replica runs on a partition of one or more processors.
|
||||
Processor partitions are defined at run-time using the "-partition
|
||||
command-line switch"_Run_options.html. Note that if you have MPI
|
||||
installed, you can run a multi-replica simulation with more replicas
|
||||
(partitions) than you have physical processors, e.g. you can run a
|
||||
10-replica simulation on one or two processors. However for PRD, this
|
||||
makes little sense, since running a replica on virtual instead of
|
||||
physical processors offers no effective parallel speed-up in searching
|
||||
for infrequent events. See the "Howto replica"_Howto_replica.html doc
|
||||
page for further discussion.
|
||||
|
||||
When a PRD simulation is performed, it is assumed that each replica is
|
||||
@ -78,8 +90,8 @@ I.e. the simulation domain, the number of atoms, the interaction
|
||||
potentials, etc should be the same for every replica.
|
||||
|
||||
A PRD run has several stages, which are repeated each time an "event"
|
||||
occurs in one of the replicas, as defined below. The logic for a PRD
|
||||
run is as follows:
|
||||
occurs in one of the replicas, as explained below. The logic for a
|
||||
PRD run is as follows:
|
||||
|
||||
while (time remains):
|
||||
dephase for n_dephase*t_dephase steps
|
||||
@ -129,7 +141,8 @@ Minimization parameters may be set via the
|
||||
PRD command. The latter are the settings that would be used with the
|
||||
"minimize"_minimize.html command. Note that typically, you do not
|
||||
need to perform a highly-converged minimization to detect a transition
|
||||
event.
|
||||
event, though you may need to in order to prevent a set of atoms in
|
||||
the system from relaxing to a saddle point.
|
||||
|
||||
The event check is performed by a compute with the specified
|
||||
{compute-ID}. Currently there is only one compute that works with the
|
||||
@ -307,7 +320,7 @@ deposit"_fix_deposit.html.
|
||||
"min_modify"_min_modify.html, "min_style"_min_style.html,
|
||||
"run_style"_run_style.html, "minimize"_minimize.html,
|
||||
"velocity"_velocity.html, "temper"_temper.html, "neb"_neb.html,
|
||||
"tad"_tad.html
|
||||
"tad"_tad.html, "hyper"_hyper.html
|
||||
|
||||
[Default:]
|
||||
|
||||
|
||||
@ -78,6 +78,7 @@ friction: frictional contact of spherical asperities between 2d surfaces
|
||||
gcmc: Grand Canonical Monte Carlo (GCMC) via the fix gcmc command
|
||||
granregion: use of fix wall/region/gran as boundary on granular particles
|
||||
hugoniostat: Hugoniostat shock dynamics
|
||||
hyper: global and local hyperdynamics of diffusion on Pt surface
|
||||
indent: spherical indenter into a 2d solid
|
||||
kim: use of potentials in Knowledge Base for Interatomic Models (KIM)
|
||||
latte: use of LATTE density-functional tight-binding quantum code
|
||||
|
||||
184
examples/hyper/adatoms.list.37K
Normal file
@ -0,0 +1,184 @@
|
||||
create_atoms 1 single 27.5 9.5 4
|
||||
create_atoms 1 single 16 9 4
|
||||
create_atoms 1 single 10 12 4
|
||||
create_atoms 1 single 31 44 4
|
||||
create_atoms 1 single 13 17 4
|
||||
create_atoms 1 single 8.5 28.5 4
|
||||
create_atoms 1 single 23 26 4
|
||||
create_atoms 1 single 38 27 4
|
||||
create_atoms 1 single 37.5 4.5 4
|
||||
create_atoms 1 single 41.5 47.5 4
|
||||
create_atoms 1 single 20.5 37.5 4
|
||||
create_atoms 1 single 5 8 4
|
||||
create_atoms 1 single 2.5 16.5 4
|
||||
create_atoms 1 single 38.5 45.5 4
|
||||
create_atoms 1 single 9 0 4
|
||||
create_atoms 1 single 39 32 4
|
||||
create_atoms 1 single 45.5 11.5 4
|
||||
create_atoms 1 single 40 0 4
|
||||
create_atoms 1 single 44.5 2.5 4
|
||||
create_atoms 1 single 4.5 44.5 4
|
||||
create_atoms 1 single 24.5 13.5 4
|
||||
create_atoms 1 single 47.5 23.5 4
|
||||
create_atoms 1 single 1 20 4
|
||||
create_atoms 1 single 38.5 31.5 4
|
||||
create_atoms 1 single 12.5 12.5 4
|
||||
create_atoms 1 single 2 27 4
|
||||
create_atoms 1 single 21 5 4
|
||||
create_atoms 1 single 47 12 4
|
||||
create_atoms 1 single 32.5 46.5 4
|
||||
create_atoms 1 single 9.5 40.5 4
|
||||
create_atoms 1 single 8.5 2.5 4
|
||||
create_atoms 1 single 41.5 22.5 4
|
||||
create_atoms 1 single 29 11 4
|
||||
create_atoms 1 single 3.5 3.5 4
|
||||
create_atoms 1 single 5 21 4
|
||||
create_atoms 1 single 46.5 31.5 4
|
||||
create_atoms 1 single 35 46 4
|
||||
create_atoms 1 single 40.5 41.5 4
|
||||
create_atoms 1 single 10 22 4
|
||||
create_atoms 1 single 43.5 14.5 4
|
||||
create_atoms 1 single 42 42 4
|
||||
create_atoms 1 single 4 26 4
|
||||
create_atoms 1 single 19 34 4
|
||||
create_atoms 1 single 33 9 4
|
||||
create_atoms 1 single 0.5 45.5 4
|
||||
create_atoms 1 single 30.5 32.5 4
|
||||
create_atoms 1 single 25.5 5.5 4
|
||||
create_atoms 1 single 47.5 39.5 4
|
||||
create_atoms 1 single 15 13 4
|
||||
create_atoms 1 single 21 21 4
|
||||
create_atoms 1 single 14 28 4
|
||||
create_atoms 1 single 9 34 4
|
||||
create_atoms 1 single 7 38 4
|
||||
create_atoms 1 single 11 35 4
|
||||
create_atoms 1 single 20.5 45.5 4
|
||||
create_atoms 1 single 30.5 31.5 4
|
||||
create_atoms 1 single 32.5 2.5 4
|
||||
create_atoms 1 single 21.5 3.5 4
|
||||
create_atoms 1 single 23 12 4
|
||||
create_atoms 1 single 4.5 33.5 4
|
||||
create_atoms 1 single 46 43 4
|
||||
create_atoms 1 single 42.5 45.5 4
|
||||
create_atoms 1 single 4.5 10.5 4
|
||||
create_atoms 1 single 33.5 15.5 4
|
||||
create_atoms 1 single 24 5 4
|
||||
create_atoms 1 single 13 16 4
|
||||
create_atoms 1 single 16.5 23.5 4
|
||||
create_atoms 1 single 45.5 28.5 4
|
||||
create_atoms 1 single 44.5 5.5 4
|
||||
create_atoms 1 single 27.5 46.5 4
|
||||
create_atoms 1 single 44.5 12.5 4
|
||||
create_atoms 1 single 12 41 4
|
||||
create_atoms 1 single 6 4 4
|
||||
create_atoms 1 single 31.5 10.5 4
|
||||
create_atoms 1 single 1 44 4
|
||||
create_atoms 1 single 31 4 4
|
||||
create_atoms 1 single 21 33 4
|
||||
create_atoms 1 single 3 33 4
|
||||
create_atoms 1 single 15 10 4
|
||||
create_atoms 1 single 28.5 22.5 4
|
||||
create_atoms 1 single 43 1 4
|
||||
create_atoms 1 single 3.5 0.5 4
|
||||
create_atoms 1 single 41 37 4
|
||||
create_atoms 1 single 18.5 43.5 4
|
||||
create_atoms 1 single 17 27 4
|
||||
create_atoms 1 single 3 5 4
|
||||
create_atoms 1 single 18.5 23.5 4
|
||||
create_atoms 1 single 31.5 14.5 4
|
||||
create_atoms 1 single 41 31 4
|
||||
create_atoms 1 single 22 3 4
|
||||
create_atoms 1 single 14.5 40.5 4
|
||||
create_atoms 1 single 9 38 4
|
||||
create_atoms 1 single 36 42 4
|
||||
create_atoms 1 single 33 22 4
|
||||
create_atoms 1 single 15.5 47.5 4
|
||||
create_atoms 1 single 3 0 4
|
||||
create_atoms 1 single 25.5 27.5 4
|
||||
create_atoms 1 single 2.5 28.5 4
|
||||
create_atoms 1 single 29.5 28.5 4
|
||||
create_atoms 1 single 44.5 18.5 4
|
||||
create_atoms 1 single 26 40 4
|
||||
create_atoms 1 single 41 27 4
|
||||
create_atoms 1 single 39.5 5.5 4
|
||||
create_atoms 1 single 3 38 4
|
||||
create_atoms 1 single 35 29 4
|
||||
create_atoms 1 single 11 19 4
|
||||
create_atoms 1 single 18 1 4
|
||||
create_atoms 1 single 39.5 40.5 4
|
||||
create_atoms 1 single 46 17 4
|
||||
create_atoms 1 single 1.5 23.5 4
|
||||
create_atoms 1 single 28.5 23.5 4
|
||||
create_atoms 1 single 10 28 4
|
||||
create_atoms 1 single 19 47 4
|
||||
create_atoms 1 single 10.5 16.5 4
|
||||
create_atoms 1 single 38 45 4
|
||||
create_atoms 1 single 42.5 41.5 4
|
||||
create_atoms 1 single 47.5 42.5 4
|
||||
create_atoms 1 single 38 7 4
|
||||
create_atoms 1 single 10 44 4
|
||||
create_atoms 1 single 29.5 27.5 4
|
||||
create_atoms 1 single 45 30 4
|
||||
create_atoms 1 single 3 9 4
|
||||
create_atoms 1 single 8.5 35.5 4
|
||||
create_atoms 1 single 24 44 4
|
||||
create_atoms 1 single 47 4 4
|
||||
create_atoms 1 single 7.5 8.5 4
|
||||
create_atoms 1 single 32.5 41.5 4
|
||||
create_atoms 1 single 0.5 34.5 4
|
||||
create_atoms 1 single 11 8 4
|
||||
create_atoms 1 single 2 40 4
|
||||
create_atoms 1 single 25 24 4
|
||||
create_atoms 1 single 47.5 6.5 4
|
||||
create_atoms 1 single 39.5 28.5 4
|
||||
create_atoms 1 single 17 21 4
|
||||
create_atoms 1 single 32 43 4
|
||||
create_atoms 1 single 16.5 29.5 4
|
||||
create_atoms 1 single 34 34 4
|
||||
create_atoms 1 single 11.5 3.5 4
|
||||
create_atoms 1 single 39 22 4
|
||||
create_atoms 1 single 24.5 36.5 4
|
||||
create_atoms 1 single 33 31 4
|
||||
create_atoms 1 single 35.5 35.5 4
|
||||
create_atoms 1 single 14.5 34.5 4
|
||||
create_atoms 1 single 34 28 4
|
||||
create_atoms 1 single 37 41 4
|
||||
create_atoms 1 single 33 46 4
|
||||
create_atoms 1 single 27.5 28.5 4
|
||||
create_atoms 1 single 40.5 22.5 4
|
||||
create_atoms 1 single 27.5 1.5 4
|
||||
create_atoms 1 single 12 2 4
|
||||
create_atoms 1 single 36 43 4
|
||||
create_atoms 1 single 28.5 9.5 4
|
||||
create_atoms 1 single 20.5 25.5 4
|
||||
create_atoms 1 single 3 3 4
|
||||
create_atoms 1 single 38 33 4
|
||||
create_atoms 1 single 3 20 4
|
||||
create_atoms 1 single 35 11 4
|
||||
create_atoms 1 single 5 25 4
|
||||
create_atoms 1 single 36.5 6.5 4
|
||||
create_atoms 1 single 19.5 24.5 4
|
||||
create_atoms 1 single 27 41 4
|
||||
create_atoms 1 single 39.5 11.5 4
|
||||
create_atoms 1 single 21.5 2.5 4
|
||||
create_atoms 1 single 46.5 15.5 4
|
||||
create_atoms 1 single 13 24 4
|
||||
create_atoms 1 single 11 37 4
|
||||
create_atoms 1 single 11.5 31.5 4
|
||||
create_atoms 1 single 47 0 4
|
||||
create_atoms 1 single 25.5 17.5 4
|
||||
create_atoms 1 single 32 11 4
|
||||
create_atoms 1 single 8 17 4
|
||||
create_atoms 1 single 27.5 12.5 4
|
||||
create_atoms 1 single 25 7 4
|
||||
create_atoms 1 single 25.5 37.5 4
|
||||
create_atoms 1 single 12 15 4
|
||||
create_atoms 1 single 1 7 4
|
||||
create_atoms 1 single 18.5 47.5 4
|
||||
create_atoms 1 single 5 38 4
|
||||
create_atoms 1 single 42 19 4
|
||||
create_atoms 1 single 30.5 7.5 4
|
||||
create_atoms 1 single 42.5 7.5 4
|
||||
create_atoms 1 single 26.5 18.5 4
|
||||
create_atoms 1 single 18.5 1.5 4
|
||||
create_atoms 1 single 41.5 10.5 4
|
||||
BIN
examples/hyper/global.10Oct18.000000.jpg
Normal file
|
After Width: | Height: | Size: 71 KiB |
BIN
examples/hyper/global.10Oct18.003000.jpg
Normal file
|
After Width: | Height: | Size: 70 KiB |
BIN
examples/hyper/global.10Oct18.038000.jpg
Normal file
|
After Width: | Height: | Size: 69 KiB |
BIN
examples/hyper/global.10Oct18.059000.jpg
Normal file
|
After Width: | Height: | Size: 70 KiB |
95
examples/hyper/in.hyper.global
Normal file
@ -0,0 +1,95 @@
|
||||
# 3d EAM surface for global HD
|
||||
|
||||
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
|
||||
# hop event on (100) surface is same distance
|
||||
# exchange event is 2 atoms moving same distance
|
||||
|
||||
variable Tequil index 500.0
|
||||
variable Vmax index 0.5
|
||||
variable qfactor index 0.3
|
||||
variable cutbond index 3.2
|
||||
variable cutevent index 1.1
|
||||
variable steps index 100000
|
||||
variable nevent index 1000
|
||||
variable zoom index 1.8
|
||||
|
||||
units metal
|
||||
atom_style atomic
|
||||
atom_modify map array
|
||||
boundary p p p
|
||||
|
||||
lattice fcc 3.92
|
||||
region box block 0 6 0 6 0 4
|
||||
create_box 3 box
|
||||
create_atoms 1 box
|
||||
|
||||
mass * 1.0
|
||||
|
||||
change_box all z final -0.1 5.0 boundary p p f
|
||||
create_atoms 2 single 3.5 3.5 4
|
||||
|
||||
# define frozen substrate and mobile atoms
|
||||
|
||||
group adatom type 2
|
||||
region base block INF INF INF INF 0 1.8
|
||||
set region base type 3
|
||||
group base type 3
|
||||
group mobile type 1 2
|
||||
|
||||
# pair style
|
||||
|
||||
pair_style eam/alloy
|
||||
pair_coeff * * ptvoterlammps.eam Pt Pt Pt
|
||||
|
||||
neighbor 0.5 bin
|
||||
neigh_modify every 1 delay 5 check yes
|
||||
|
||||
fix 1 mobile nve
|
||||
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 858872873 zero yes
|
||||
|
||||
timestep 0.005
|
||||
|
||||
compute tmobile mobile temp
|
||||
|
||||
thermo 100
|
||||
thermo_modify temp tmobile
|
||||
|
||||
# thermal equilibration
|
||||
|
||||
run 1000
|
||||
reset_timestep 0
|
||||
|
||||
# pin base so will not move during quenches
|
||||
|
||||
fix freeze base setforce 0.0 0.0 0.0
|
||||
|
||||
# event detection
|
||||
|
||||
compute event all event/displace ${cutevent}
|
||||
|
||||
# hyper/global
|
||||
|
||||
fix HG mobile hyper/global ${cutbond} ${qfactor} ${Vmax} ${Tequil}
|
||||
|
||||
# thermo output
|
||||
|
||||
thermo_style custom step temp pe f_HG f_HG[*]
|
||||
|
||||
thermo_modify lost ignore
|
||||
thermo_modify temp tmobile
|
||||
|
||||
thermo ${nevent}
|
||||
|
||||
# dump output options
|
||||
|
||||
region substrate block INF INF INF INF 1.8 3.8
|
||||
region adatoms block INF INF INF INF 3.8 INF
|
||||
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
|
||||
|
||||
dump 1 all image 1000000 global.*.jpg v_acolor type &
|
||||
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
|
||||
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
|
||||
|
||||
# run
|
||||
|
||||
hyper ${steps} ${nevent} HG event min 1.0e-6 1.0e-6 100 100 dump 1
|
||||
112
examples/hyper/in.hyper.local
Normal file
@ -0,0 +1,112 @@
|
||||
# 3d EAM surface for local HD
|
||||
|
||||
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
|
||||
# hop event on (100) surface is same distance
|
||||
# exchange event is 2 atoms moving same distance
|
||||
|
||||
variable Tequil index 400.0
|
||||
variable Vmax index 0.4
|
||||
variable qfactor index 0.3
|
||||
variable cutbond index 3.2
|
||||
variable Dcut index 10.0
|
||||
variable cutevent index 1.1
|
||||
variable alpha index 200.0
|
||||
variable boost index 4000.0
|
||||
variable ghostcut index 12.0
|
||||
variable steps index 1500
|
||||
variable nevent index 100
|
||||
variable nx index 8
|
||||
variable ny index 8
|
||||
variable zoom index 1.8
|
||||
variable seed index 826626413
|
||||
variable tol index 1.0e-15
|
||||
variable add index 37K
|
||||
|
||||
units metal
|
||||
atom_style atomic
|
||||
atom_modify map array
|
||||
boundary p p p
|
||||
comm_modify cutoff ${ghostcut}
|
||||
|
||||
lattice fcc 3.92
|
||||
region box block 0 6 0 6 0 4
|
||||
create_box 2 box
|
||||
create_atoms 1 box
|
||||
|
||||
mass * 1.0
|
||||
|
||||
change_box all z final -0.1 5.0 boundary p p f
|
||||
|
||||
# replicate in xy
|
||||
|
||||
replicate ${nx} ${ny} 1
|
||||
|
||||
# add adatoms
|
||||
|
||||
include adatoms.list.${add}
|
||||
|
||||
# define frozen substrate and mobile atoms
|
||||
|
||||
region base block INF INF INF INF 0 1.8
|
||||
set region base type 2
|
||||
group base type 2
|
||||
group mobile type 1
|
||||
|
||||
# pair style
|
||||
|
||||
pair_style eam/alloy
|
||||
pair_coeff * * ptvoterlammps.eam Pt Pt
|
||||
|
||||
neighbor 0.5 bin
|
||||
neigh_modify every 1 delay 5 check yes
|
||||
|
||||
fix 1 mobile nve
|
||||
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
|
||||
|
||||
timestep 0.005
|
||||
|
||||
compute tmobile mobile temp
|
||||
|
||||
thermo 100
|
||||
thermo_modify temp tmobile
|
||||
|
||||
# thermal equilibration
|
||||
|
||||
run 1000
|
||||
reset_timestep 0
|
||||
|
||||
# pin base so will not move during quenches
|
||||
|
||||
fix freeze base setforce 0.0 0.0 0.0
|
||||
|
||||
# event detection
|
||||
|
||||
compute event all event/displace ${cutevent}
|
||||
|
||||
# hyper/local
|
||||
|
||||
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} &
|
||||
${Dcut} ${alpha} ${boost}
|
||||
|
||||
# thermo output
|
||||
|
||||
thermo_style custom step temp pe f_HL f_HL[*]
|
||||
|
||||
thermo_modify lost ignore
|
||||
thermo_modify temp tmobile
|
||||
|
||||
thermo ${nevent}
|
||||
|
||||
# dump
|
||||
|
||||
region substrate block INF INF INF INF 1.8 3.8
|
||||
region adatoms block INF INF INF INF 3.8 INF
|
||||
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
|
||||
|
||||
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 &
|
||||
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
|
||||
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
|
||||
|
||||
# run
|
||||
|
||||
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
|
||||
BIN
examples/hyper/local.10Oct18.000000.jpg
Normal file
|
After Width: | Height: | Size: 482 KiB |
BIN
examples/hyper/local.10Oct18.000700.jpg
Normal file
|
After Width: | Height: | Size: 479 KiB |
BIN
examples/hyper/local.10Oct18.000800.jpg
Normal file
|
After Width: | Height: | Size: 477 KiB |
BIN
examples/hyper/local.10Oct18.001100.jpg
Normal file
|
After Width: | Height: | Size: 479 KiB |
1243
examples/hyper/log.10Oct18.hyper.global.g++.4
Normal file
993
examples/hyper/log.10Oct18.hyper.local.g++.16
Normal file
@ -0,0 +1,993 @@
|
||||
LAMMPS (10 Oct 2018)
|
||||
# 3d EAM surface for local HD
|
||||
|
||||
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
|
||||
# hop event on (100) surface is same distance
|
||||
# exchange event is 2 atoms moving same distance
|
||||
|
||||
variable Tequil index 400.0
|
||||
variable Vmax index 0.4
|
||||
variable qfactor index 0.3
|
||||
variable cutbond index 3.2
|
||||
variable Dcut index 10.0
|
||||
variable cutevent index 1.1
|
||||
variable alpha index 200.0
|
||||
variable boost index 4000.0
|
||||
variable ghostcut index 12.0
|
||||
variable steps index 1500
|
||||
variable nevent index 100
|
||||
variable nx index 8
|
||||
variable ny index 8
|
||||
variable zoom index 1.8
|
||||
variable seed index 826626413
|
||||
variable tol index 1.0e-15
|
||||
variable add index 37K
|
||||
|
||||
units metal
|
||||
atom_style atomic
|
||||
atom_modify map array
|
||||
boundary p p p
|
||||
comm_modify cutoff ${ghostcut}
|
||||
comm_modify cutoff 12.0
|
||||
|
||||
lattice fcc 3.92
|
||||
Lattice spacing in x,y,z = 3.92 3.92 3.92
|
||||
region box block 0 6 0 6 0 4
|
||||
create_box 2 box
|
||||
Created orthogonal box = (0 0 0) to (23.52 23.52 15.68)
|
||||
2 by 4 by 2 MPI processor grid
|
||||
create_atoms 1 box
|
||||
Created 576 atoms
|
||||
Time spent = 0.00108504 secs
|
||||
|
||||
mass * 1.0
|
||||
|
||||
change_box all z final -0.1 5.0 boundary p p f
|
||||
orthogonal box = (0 0 -0.392) to (23.52 23.52 19.6)
|
||||
|
||||
# replicate in xy
|
||||
|
||||
replicate ${nx} ${ny} 1
|
||||
replicate 8 ${ny} 1
|
||||
replicate 8 8 1
|
||||
orthogonal box = (0 0 -0.392) to (188.16 188.16 19.6)
|
||||
4 by 4 by 1 MPI processor grid
|
||||
36864 atoms
|
||||
Time spent = 0.0028758 secs
|
||||
|
||||
# add adatoms
|
||||
|
||||
include adatoms.list.${add}
|
||||
include adatoms.list.37K
|
||||
create_atoms 1 single 27.5 9.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000183105 secs
|
||||
create_atoms 1 single 16 9 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000178099 secs
|
||||
create_atoms 1 single 10 12 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000179768 secs
|
||||
create_atoms 1 single 31 44 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000184059 secs
|
||||
create_atoms 1 single 13 17 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000173807 secs
|
||||
create_atoms 1 single 8.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000167847 secs
|
||||
create_atoms 1 single 23 26 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000179052 secs
|
||||
create_atoms 1 single 38 27 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000169992 secs
|
||||
create_atoms 1 single 37.5 4.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166178 secs
|
||||
create_atoms 1 single 41.5 47.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000172138 secs
|
||||
create_atoms 1 single 20.5 37.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 5 8 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00018096 secs
|
||||
create_atoms 1 single 2.5 16.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 38.5 45.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 9 0 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000168085 secs
|
||||
create_atoms 1 single 39 32 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000170946 secs
|
||||
create_atoms 1 single 45.5 11.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00018096 secs
|
||||
create_atoms 1 single 40 0 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000168085 secs
|
||||
create_atoms 1 single 44.5 2.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 4.5 44.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000168085 secs
|
||||
create_atoms 1 single 24.5 13.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 47.5 23.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00018096 secs
|
||||
create_atoms 1 single 1 20 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166893 secs
|
||||
create_atoms 1 single 38.5 31.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000168085 secs
|
||||
create_atoms 1 single 12.5 12.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000169992 secs
|
||||
create_atoms 1 single 2 27 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000188828 secs
|
||||
create_atoms 1 single 21 5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000174999 secs
|
||||
create_atoms 1 single 47 12 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 32.5 46.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 9.5 40.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166893 secs
|
||||
create_atoms 1 single 8.5 2.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 41.5 22.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000174046 secs
|
||||
create_atoms 1 single 29 11 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166893 secs
|
||||
create_atoms 1 single 3.5 3.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165224 secs
|
||||
create_atoms 1 single 5 21 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 46.5 31.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166178 secs
|
||||
create_atoms 1 single 35 46 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000183105 secs
|
||||
create_atoms 1 single 40.5 41.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 10 22 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 43.5 14.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000169992 secs
|
||||
create_atoms 1 single 42 42 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 4 26 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000174999 secs
|
||||
create_atoms 1 single 19 34 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000163078 secs
|
||||
create_atoms 1 single 33 9 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 0.5 45.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000163078 secs
|
||||
create_atoms 1 single 30.5 32.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 25.5 5.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000178099 secs
|
||||
create_atoms 1 single 47.5 39.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000165939 secs
|
||||
create_atoms 1 single 15 13 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 21 21 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 14 28 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 9 34 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000174999 secs
|
||||
create_atoms 1 single 7 38 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000175953 secs
|
||||
create_atoms 1 single 11 35 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 20.5 45.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 30.5 31.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000159979 secs
|
||||
create_atoms 1 single 32.5 2.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000166178 secs
|
||||
create_atoms 1 single 21.5 3.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000157833 secs
|
||||
create_atoms 1 single 23 12 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 4.5 33.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 46 43 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 42.5 45.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 4.5 10.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000158072 secs
|
||||
create_atoms 1 single 33.5 15.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000157833 secs
|
||||
create_atoms 1 single 24 5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 13 16 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000158072 secs
|
||||
create_atoms 1 single 16.5 23.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156164 secs
|
||||
create_atoms 1 single 45.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000247002 secs
|
||||
create_atoms 1 single 44.5 5.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156164 secs
|
||||
create_atoms 1 single 27.5 46.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 44.5 12.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000157833 secs
|
||||
create_atoms 1 single 12 41 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 6 4 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.0001688 secs
|
||||
create_atoms 1 single 31.5 10.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00015521 secs
|
||||
create_atoms 1 single 1 44 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 31 4 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 21 33 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000156879 secs
|
||||
create_atoms 1 single 3 33 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164032 secs
|
||||
create_atoms 1 single 15 10 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.0001719 secs
|
||||
create_atoms 1 single 28.5 22.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000153065 secs
|
||||
create_atoms 1 single 43 1 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 3.5 0.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 41 37 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000153065 secs
|
||||
create_atoms 1 single 18.5 43.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000213146 secs
|
||||
create_atoms 1 single 17 27 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000159979 secs
|
||||
create_atoms 1 single 3 5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000153065 secs
|
||||
create_atoms 1 single 18.5 23.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 31.5 14.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 41 31 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 22 3 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00015521 secs
|
||||
create_atoms 1 single 14.5 40.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 9 38 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 36 42 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 33 22 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000163078 secs
|
||||
create_atoms 1 single 15.5 47.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 3 0 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 25.5 27.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000176907 secs
|
||||
create_atoms 1 single 2.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 29.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000162125 secs
|
||||
create_atoms 1 single 44.5 18.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000152826 secs
|
||||
create_atoms 1 single 26 40 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 41 27 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000158072 secs
|
||||
create_atoms 1 single 39.5 5.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000155926 secs
|
||||
create_atoms 1 single 3 38 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000152826 secs
|
||||
create_atoms 1 single 35 29 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 11 19 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164986 secs
|
||||
create_atoms 1 single 18 1 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146866 secs
|
||||
create_atoms 1 single 39.5 40.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146866 secs
|
||||
create_atoms 1 single 46 17 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 1.5 23.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 28.5 23.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 10 28 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000159979 secs
|
||||
create_atoms 1 single 19 47 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000148058 secs
|
||||
create_atoms 1 single 10.5 16.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000147104 secs
|
||||
create_atoms 1 single 38 45 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 42.5 41.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000161886 secs
|
||||
create_atoms 1 single 47.5 42.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000147104 secs
|
||||
create_atoms 1 single 38 7 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 10 44 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 29.5 27.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 45 30 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 3 9 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 8.5 35.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 24 44 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 47 4 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 7.5 8.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 32.5 41.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000157833 secs
|
||||
create_atoms 1 single 0.5 34.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 11 8 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000147104 secs
|
||||
create_atoms 1 single 2 40 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 25 24 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 47.5 6.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000147104 secs
|
||||
create_atoms 1 single 39.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 17 21 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000164032 secs
|
||||
create_atoms 1 single 32 43 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 16.5 29.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 34 34 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 11.5 3.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154018 secs
|
||||
create_atoms 1 single 39 22 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 24.5 36.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 33 31 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 35.5 35.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 14.5 34.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146866 secs
|
||||
create_atoms 1 single 34 28 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000153065 secs
|
||||
create_atoms 1 single 37 41 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 33 46 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 27.5 28.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145197 secs
|
||||
create_atoms 1 single 40.5 22.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000150919 secs
|
||||
create_atoms 1 single 27.5 1.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 12 2 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000151873 secs
|
||||
create_atoms 1 single 36 43 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144005 secs
|
||||
create_atoms 1 single 28.5 9.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 20.5 25.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 3 3 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144005 secs
|
||||
create_atoms 1 single 38 33 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 3 20 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000154972 secs
|
||||
create_atoms 1 single 35 11 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000145912 secs
|
||||
create_atoms 1 single 5 25 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144005 secs
|
||||
create_atoms 1 single 36.5 6.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144005 secs
|
||||
create_atoms 1 single 19.5 24.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000236988 secs
|
||||
create_atoms 1 single 27 41 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000169992 secs
|
||||
create_atoms 1 single 39.5 11.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138998 secs
|
||||
create_atoms 1 single 21.5 2.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000136852 secs
|
||||
create_atoms 1 single 46.5 15.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138998 secs
|
||||
create_atoms 1 single 13 24 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000137091 secs
|
||||
create_atoms 1 single 11 37 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144005 secs
|
||||
create_atoms 1 single 11.5 31.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000144958 secs
|
||||
create_atoms 1 single 47 0 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 25.5 17.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.00014019 secs
|
||||
create_atoms 1 single 32 11 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 8 17 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 27.5 12.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000137806 secs
|
||||
create_atoms 1 single 25 7 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000146151 secs
|
||||
create_atoms 1 single 25.5 37.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000139952 secs
|
||||
create_atoms 1 single 12 15 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 1 7 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138998 secs
|
||||
create_atoms 1 single 18.5 47.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 5 38 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000136852 secs
|
||||
create_atoms 1 single 42 19 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000149012 secs
|
||||
create_atoms 1 single 30.5 7.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138044 secs
|
||||
create_atoms 1 single 42.5 7.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000138998 secs
|
||||
create_atoms 1 single 26.5 18.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000153065 secs
|
||||
create_atoms 1 single 18.5 1.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000137091 secs
|
||||
create_atoms 1 single 41.5 10.5 4
|
||||
Created 1 atoms
|
||||
Time spent = 0.000140905 secs
|
||||
|
||||
# define frozen substrate and mobile atoms
|
||||
|
||||
region base block INF INF INF INF 0 1.8
|
||||
set region base type 2
|
||||
18432 settings made for type
|
||||
group base type 2
|
||||
18432 atoms in group base
|
||||
group mobile type 1
|
||||
18616 atoms in group mobile
|
||||
|
||||
# pair style
|
||||
|
||||
pair_style eam/alloy
|
||||
pair_coeff * * ptvoterlammps.eam Pt Pt
|
||||
|
||||
neighbor 0.5 bin
|
||||
neigh_modify every 1 delay 5 check yes
|
||||
|
||||
fix 1 mobile nve
|
||||
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
|
||||
fix 2 mobile langevin 400.0 ${Tequil} 1.0 ${seed} zero yes
|
||||
fix 2 mobile langevin 400.0 400.0 1.0 ${seed} zero yes
|
||||
fix 2 mobile langevin 400.0 400.0 1.0 826626413 zero yes
|
||||
|
||||
timestep 0.005
|
||||
|
||||
compute tmobile mobile temp
|
||||
|
||||
thermo 100
|
||||
thermo_modify temp tmobile
|
||||
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
|
||||
|
||||
# thermal equilibration
|
||||
|
||||
run 1000
|
||||
Neighbor list info ...
|
||||
update every 1 steps, delay 5 steps, check yes
|
||||
max neighbors/atom: 2000, page size: 100000
|
||||
master list distance cutoff = 6.07583
|
||||
ghost atom cutoff = 12
|
||||
binsize = 3.03792, bins = 62 62 7
|
||||
1 neighbor lists, perpetual/occasional/extra = 1 0 0
|
||||
(1) pair eam/alloy, perpetual
|
||||
attributes: half, newton on
|
||||
pair build: half/bin/atomonly/newton
|
||||
stencil: half/bin/3d/newton
|
||||
bin: standard
|
||||
Per MPI rank memory allocation (min/avg/max) = 3.359 | 3.359 | 3.36 Mbytes
|
||||
Step Temp E_pair E_mol TotEng Press
|
||||
0 0 -206220.22 0 -206220.22 -52155.664
|
||||
100 188.18127 -206044.43 0 -205591.63 -25068.83
|
||||
200 274.34464 -205860.78 0 -205200.66 -40191.797
|
||||
300 325.66286 -205750.01 0 -204966.4 -31510.222
|
||||
400 352.48242 -205675.42 0 -204827.28 -35058.064
|
||||
500 370.88571 -205619.66 0 -204727.25 -32735.022
|
||||
600 388.62129 -205592.87 0 -204657.78 -33904.556
|
||||
700 389.54874 -205579.73 0 -204642.4 -32769.852
|
||||
800 395.56074 -205576.82 0 -204625.03 -33755.948
|
||||
900 398.03458 -205564.48 0 -204606.74 -32777.103
|
||||
1000 401.24089 -205562.85 0 -204597.4 -33785.341
|
||||
Loop time of 4.3687 on 16 procs for 1000 steps with 37048 atoms
|
||||
|
||||
Performance: 98.885 ns/day, 0.243 hours/ns, 228.901 timesteps/s
|
||||
98.4% CPU use with 16 MPI tasks x no OpenMP threads
|
||||
|
||||
MPI task timing breakdown:
|
||||
Section | min time | avg time | max time |%varavg| %total
|
||||
---------------------------------------------------------------
|
||||
Pair | 3.2988 | 3.3828 | 3.4667 | 2.3 | 77.43
|
||||
Neigh | 0.20856 | 0.23127 | 0.24382 | 1.9 | 5.29
|
||||
Comm | 0.33313 | 0.45075 | 0.55485 | 9.2 | 10.32
|
||||
Output | 0.00042987 | 0.00044042 | 0.00049591 | 0.0 | 0.01
|
||||
Modify | 0.18811 | 0.28363 | 0.36798 | 9.7 | 6.49
|
||||
Other | | 0.01983 | | | 0.45
|
||||
|
||||
Nlocal: 2315.5 ave 2332 max 2297 min
|
||||
Histogram: 2 0 0 3 4 0 2 1 2 2
|
||||
Nghost: 3186.31 ave 3205 max 3170 min
|
||||
Histogram: 2 1 3 0 2 3 2 1 0 2
|
||||
Neighs: 55590.9 ave 56174 max 55103 min
|
||||
Histogram: 2 2 1 1 4 1 3 0 0 2
|
||||
|
||||
Total # of neighbors = 889454
|
||||
Ave neighs/atom = 24.0082
|
||||
Neighbor list builds = 105
|
||||
Dangerous builds = 0
|
||||
reset_timestep 0
|
||||
|
||||
# pin base so will not move during quenches
|
||||
|
||||
fix freeze base setforce 0.0 0.0 0.0
|
||||
|
||||
# event detection
|
||||
|
||||
compute event all event/displace ${cutevent}
|
||||
compute event all event/displace 1.1
|
||||
|
||||
# hyper/local
|
||||
|
||||
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 0.4 ${Tequil} ${Dcut} ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 ${Dcut} ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 ${alpha} ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 ${boost}
|
||||
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 4000.0
|
||||
|
||||
# thermo output
|
||||
|
||||
thermo_style custom step temp pe f_HL f_HL[*]
|
||||
WARNING: New thermo_style command, previous thermo_modify settings will be lost (../output.cpp:705)
|
||||
|
||||
thermo_modify lost ignore
|
||||
thermo_modify temp tmobile
|
||||
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
|
||||
|
||||
thermo ${nevent}
|
||||
thermo 100
|
||||
|
||||
# dump
|
||||
|
||||
region substrate block INF INF INF INF 1.8 3.8
|
||||
region adatoms block INF INF INF INF 3.8 INF
|
||||
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
|
||||
|
||||
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
|
||||
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom 1.8 adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
|
||||
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
|
||||
|
||||
# run
|
||||
|
||||
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
|
||||
hyper 1500 ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
|
||||
hyper 1500 100 HL event min ${tol} ${tol} 1000 1000 dump 1
|
||||
hyper 1500 100 HL event min 1.0e-15 ${tol} 1000 1000 dump 1
|
||||
hyper 1500 100 HL event min 1.0e-15 1.0e-15 1000 1000 dump 1
|
||||
WARNING: Resetting reneighboring criteria during hyper (../hyper.cpp:133)
|
||||
Neighbor list info ...
|
||||
update every 1 steps, delay 0 steps, check yes
|
||||
max neighbors/atom: 2000, page size: 100000
|
||||
master list distance cutoff = 6.07583
|
||||
ghost atom cutoff = 12
|
||||
binsize = 3.03792, bins = 62 62 7
|
||||
2 neighbor lists, perpetual/occasional/extra = 1 1 0
|
||||
(1) pair eam/alloy, perpetual
|
||||
attributes: half, newton on
|
||||
pair build: half/bin/atomonly/newton
|
||||
stencil: half/bin/3d/newton
|
||||
bin: standard
|
||||
(2) fix hyper/local, occasional
|
||||
attributes: full, newton on, cut 10
|
||||
pair build: full/bin/atomonly
|
||||
stencil: full/bin/3d
|
||||
bin: standard
|
||||
Per MPI rank memory allocation (min/avg/max) = 7.566 | 7.567 | 7.567 Mbytes
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
0 401.24089 -205562.85 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 0 0 0 0
|
||||
77 401.24089 -206534.96 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 1540 0 0 0
|
||||
Loop time of 0.540347 on 16 procs for 77 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
0 401.24089 -205562.85 23.271302 74 0.18753621 1 6.0138739 703.62325 0 0.55802338 3.5350432 0 0 0 4e+19 10.115141 10.115141 0 0 0 0 0 0 0 0 0
|
||||
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
|
||||
Loop time of 0.579085 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
|
||||
184 399.15639 -206534.96 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 49.934783 0.21714886 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 3680 0 0 0
|
||||
Loop time of 0.556056 on 16 procs for 84 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
100 399.15639 -205546.21 22.903938 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
|
||||
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
|
||||
Loop time of 0.581214 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
|
||||
275 403.01717 -206534.96 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 66.145455 0.29040418 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 5500 0 0 0
|
||||
Loop time of 0.481812 on 16 procs for 75 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
200 403.01717 -205543.17 21.115577 91 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
|
||||
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
|
||||
Loop time of 0.5757 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
|
||||
377 399.01963 -206534.96 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 73.225464 0.31760598 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 7540 0 0 0
|
||||
Loop time of 0.514907 on 16 procs for 77 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
300 399.01963 -205541.46 19.137003 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
|
||||
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
|
||||
Loop time of 0.577371 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
|
||||
471 398.15351 -206534.96 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 78.163482 0.33881076 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 9420 0 0 0
|
||||
Loop time of 0.465473 on 16 procs for 71 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
400 398.15351 -205544.87 20.470689 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
|
||||
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
|
||||
Loop time of 0.579188 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
|
||||
577 400.29399 -206534.96 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 79.710572 0.3455768 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 11540 0 0 0
|
||||
Loop time of 0.502193 on 16 procs for 77 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
500 400.29399 -205544.98 17.051107 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
|
||||
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
|
||||
Loop time of 0.694955 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
|
||||
680 400.96099 -206534.96 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 81.188235 0.35174818 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 13600 0 0 0
|
||||
Loop time of 0.529041 on 16 procs for 80 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
600 400.96099 -205544.56 20.904088 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
|
||||
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.1853748 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
|
||||
Loop time of 0.590093 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.2139704 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
|
||||
790 397.78618 -206534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.1995704 81.625316 0.35310868 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 15800 0 0 0
|
||||
Loop time of 0.594281 on 16 procs for 90 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
700 397.78618 -205534.96 20.236364 94 0.51088027 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.205089 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 1 2 6
|
||||
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
|
||||
Loop time of 0.583824 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
|
||||
872 399.66919 -206535.54 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 84.739679 0.36548679 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 17440 1 2 6
|
||||
Loop time of 0.46886 on 16 procs for 72 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
800 399.66919 -205547.44 21.2852 94 0.44964213 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 2 4 13
|
||||
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
|
||||
Loop time of 0.585137 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
|
||||
975 401.5853 -206535.54 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 84.939487 0.36762438 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 19500 2 4 13
|
||||
Loop time of 0.502012 on 16 procs for 75 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
900 401.5853 -205544.22 19.307938 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
|
||||
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
|
||||
Loop time of 0.588597 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
|
||||
1083 395.06218 -206535.54 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 85.421053 0.36763584 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 21660 2 4 13
|
||||
Loop time of 0.543222 on 16 procs for 83 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1000 395.06218 -205526.35 17.514191 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
|
||||
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
|
||||
Loop time of 0.590075 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
|
||||
1177 400.04484 -206535.53 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 86.464741 0.37201529 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 23540 2 4 13
|
||||
Loop time of 0.500839 on 16 procs for 77 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1100 400.04484 -205545.92 19.518413 89 0.429675 0.39705701 6.0137119 703.6043 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 3 6 19
|
||||
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
|
||||
Loop time of 0.583971 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
|
||||
1277 400.7462 -206535.53 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 86.806578 0.37396584 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 25540 3 6 19
|
||||
Loop time of 0.509118 on 16 procs for 77 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1200 400.7462 -205543.2 21.169281 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
|
||||
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
|
||||
Loop time of 0.587306 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
|
||||
1375 398.53702 -206535.53 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 87.455273 0.37616341 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 27500 3 6 19
|
||||
Loop time of 0.483781 on 16 procs for 75 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1300 398.53702 -205539.33 21.35787 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
|
||||
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
|
||||
Loop time of 0.586411 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
|
||||
1471 402.80537 -206535.53 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 88.193746 0.37856948 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 29420 3 6 19
|
||||
Loop time of 0.473799 on 16 procs for 71 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1400 402.80537 -205549.3 19.481632 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
|
||||
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
|
||||
Loop time of 0.587342 on 16 procs for 100 steps with 37048 atoms
|
||||
|
||||
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
|
||||
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
|
||||
1574 402.0803 -206535.53 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 88.491741 0.37898213 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 31480 3 6 19
|
||||
Loop time of 0.493982 on 16 procs for 74 steps with 37048 atoms
|
||||
|
||||
Final hyper stats ...
|
||||
|
||||
Cummulative quantities for fix hyper:
|
||||
hyper time = 30000
|
||||
event timesteps = 3
|
||||
# of atoms in events = 6
|
||||
Quantities for this hyper run:
|
||||
event timesteps = 3
|
||||
# of atoms in events = 6
|
||||
max length of any bond = 4.4266
|
||||
max drift distance of any atom = 2.31777
|
||||
fraction of steps & bonds with zero bias = 0.0704091
|
||||
Current quantities:
|
||||
ave bonds/atom = 6.01371
|
||||
Cummulative quantities specific tofix hyper/local:
|
||||
# of new bonds formed = 19
|
||||
max bonds/atom = 13
|
||||
Quantities for this hyper run specific to fix hyper/local:
|
||||
ave boosted bonds/step = 92.8573
|
||||
ave boost coeff of all bonds = 0.397679
|
||||
max boost coeff of any bond = 0.414894
|
||||
min boost coeff of any bond = 0.383728
|
||||
max dist from my box of any non-maxstrain bond ghost atom = 10.333
|
||||
max dist from my box of any bond ghost atom = 10.3858
|
||||
count of ghost bond neighbors not found on reneighbor steps = 0
|
||||
lost bond partners = 0
|
||||
ave bias coeff for lost bond partners = 0
|
||||
bias overlaps = 0
|
||||
non-matching bias coeffs = 0
|
||||
CPU time for bond builds = 0.044807
|
||||
Current quantities specific to fix hyper/local:
|
||||
neighbor bonds/bond = 703.604
|
||||
ave boost coeff for all bonds = 0.396356
|
||||
|
||||
Loop time of 17.9972 on 16 procs for 1500 steps with 37048 atoms
|
||||
|
||||
Performance: 36.006 ns/day, 0.667 hours/ns, 83.346 timesteps/s
|
||||
120.7% CPU use with 16 MPI tasks x no OpenMP threads
|
||||
|
||||
Hyper stats:
|
||||
Dynamics time (%) = 8.87027 (49.2869)
|
||||
Quench time (%) = 8.15972 (45.3388)
|
||||
Other time (%) = 1.2212 (6.78552)
|
||||
|
||||
MPI task timing breakdown:
|
||||
Section | min time | avg time | max time |%varavg| %total
|
||||
---------------------------------------------------------------
|
||||
Pair | 11.6 | 11.848 | 12.043 | 3.9 | 65.83
|
||||
Neigh | 0.50025 | 0.52638 | 0.55163 | 2.1 | 2.92
|
||||
Comm | 0.34528 | 0.49905 | 0.66742 | 13.3 | 2.77
|
||||
Output | 0.0021305 | 0.0021461 | 0.0022686 | 0.1 | 0.01
|
||||
Modify | 3.7498 | 3.9009 | 3.9786 | 2.8 | 21.67
|
||||
Other | | 1.221 | | | 6.79
|
||||
|
||||
Nlocal: 2315.5 ave 2361 max 2267 min
|
||||
Histogram: 1 1 0 4 2 1 3 3 0 1
|
||||
Nghost: 3187.88 ave 3236 max 3141 min
|
||||
Histogram: 1 0 3 2 2 1 4 1 1 1
|
||||
Neighs: 53950.6 ave 54989 max 53049 min
|
||||
Histogram: 2 0 3 2 1 2 4 1 0 1
|
||||
FullNghs: 542951 ave 554654 max 533224 min
|
||||
Histogram: 1 2 3 1 2 2 2 2 0 1
|
||||
|
||||
Total # of neighbors = 8687214
|
||||
Ave neighs/atom = 234.485
|
||||
Neighbor list builds = 165
|
||||
Dangerous builds = 0
|
||||
Total wall time: 0:00:22
|
||||
1206
examples/hyper/ptvoterlammps.eam
Normal file
@ -1,5 +1,68 @@
|
||||
# Change Log
|
||||
|
||||
## [2.7.24](https://github.com/kokkos/kokkos/tree/2.7.24) (2018-11-04)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.00...2.7.24)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- DualView: Add non-templated functions for sync, need\_sync, view, modify [\#1858](https://github.com/kokkos/kokkos/issues/1858)
|
||||
- DualView: Avoid needlessly allocating and initializing modify\_host and modify\_device flag views [\#1831](https://github.com/kokkos/kokkos/issues/1831)
|
||||
- DualView: Incorrect deduction of "not device type" [\#1659](https://github.com/kokkos/kokkos/issues/1659)
|
||||
- BuildSystem: Add KOKKOS\_ENABLE\_CXX14 and KOKKOS\_ENABLE\_CXX17 [\#1602](https://github.com/kokkos/kokkos/issues/1602)
|
||||
- BuildSystem: Installed kokkos\_generated\_settings.cmake contains build directories instead of install directories [\#1838](https://github.com/kokkos/kokkos/issues/1838)
|
||||
- BuildSystem: KOKKOS\_ARCH: add ticks to printout of improper arch setting [\#1649](https://github.com/kokkos/kokkos/issues/1649)
|
||||
- BuildSystem: Make core/src/Makefile for Cuda use needed nvcc\_wrapper [\#1296](https://github.com/kokkos/kokkos/issues/1296)
|
||||
- Build: Support PGI as host compiler for NVCC [\#1828](https://github.com/kokkos/kokkos/issues/1828)
|
||||
- Build: Many Warnings Fixed e.g. [\#1786](https://github.com/kokkos/kokkos/issues/1786)
|
||||
- Capability: OffsetView with non-zero begin index [\#567](https://github.com/kokkos/kokkos/issues/567)
|
||||
- Capability: Reductions into device side view [\#1788](https://github.com/kokkos/kokkos/issues/1788)
|
||||
- Capability: Add max\_size to Kokkos::Array [\#1760](https://github.com/kokkos/kokkos/issues/1760)
|
||||
- Capability: View Assignment: LayoutStride -\> LayoutLeft and LayoutStride -\> LayoutRight [\#1594](https://github.com/kokkos/kokkos/issues/1594)
|
||||
- Capability: Atomic function allow implicit conversion of update argument [\#1571](https://github.com/kokkos/kokkos/issues/1571)
|
||||
- Capability: Add team\_size\_max with tagged functors [\#663](https://github.com/kokkos/kokkos/issues/663)
|
||||
- Capability: Fix alignment of views from Kokkos\_ScratchSpace \(should use different alignment\) [\#1700](https://github.com/kokkos/kokkos/issues/1700)
|
||||
- Capability: create\_mirror\_view\_and\_copy for DynRankView [\#1651](https://github.com/kokkos/kokkos/issues/1651)
|
||||
- Capability: DeepCopy HBWSpace / HostSpace [\#548](https://github.com/kokkos/kokkos/issues/548)
|
||||
- ROCm: support team vector scan [\#1645](https://github.com/kokkos/kokkos/issues/1645)
|
||||
- ROCm: Merge from rocm-hackathon2 [\#1636](https://github.com/kokkos/kokkos/issues/1636)
|
||||
- ROCm: Add ParallelScanWithTotal [\#1611](https://github.com/kokkos/kokkos/issues/1611)
|
||||
- ROCm: Implement MDRange in ROCm [\#1314](https://github.com/kokkos/kokkos/issues/1314)
|
||||
- ROCm: Implement Reducers for Nested Parallelism Levels [\#963](https://github.com/kokkos/kokkos/issues/963)
|
||||
- ROCm: Add asynchronous deep copy [\#959](https://github.com/kokkos/kokkos/issues/959)
|
||||
- Tests: Memory pool test seems to allocate 8GB [\#1830](https://github.com/kokkos/kokkos/issues/1830)
|
||||
- Tests: Add unit\_test for team\_broadcast [\#734](https://github.com/kokkos/kokkos/issues/734)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- BuildSystem: Makefile.kokkos gets gcc-toolchain wrong if gcc is cached [\#1841](https://github.com/kokkos/kokkos/issues/1841)
|
||||
- BuildSystem: kokkos\_generated\_settings.cmake placement is inconsistent [\#1771](https://github.com/kokkos/kokkos/issues/1771)
|
||||
- BuildSystem: Invalid escape sequence \. in kokkos\_functions.cmake [\#1661](https://github.com/kokkos/kokkos/issues/1661)
|
||||
- BuildSystem: Problem in Kokkos generated cmake file [\#1770](https://github.com/kokkos/kokkos/issues/1770)
|
||||
- BuildSystem: invalid file names on windows [\#1671](https://github.com/kokkos/kokkos/issues/1671)
|
||||
- Tests: reducers min/max\_loc test fails randomly due to multiple min values and thus multiple valid locations [\#1681](https://github.com/kokkos/kokkos/issues/1681)
|
||||
- Tests: cuda.scatterview unit test causes "Bus error" when force\_uvm and enable\_lambda are enabled [\#1852](https://github.com/kokkos/kokkos/issues/1852)
|
||||
- Tests: cuda.cxx11 unit test fails when force\_uvm and enable\_lambda are enabled [\#1850](https://github.com/kokkos/kokkos/issues/1850)
|
||||
- Tests: threads.reduce\_device\_view\_range\_policy failing with Cuda/8.0.44 and RDC [\#1836](https://github.com/kokkos/kokkos/issues/1836)
|
||||
- Build: compile error when compiling Kokkos with hwloc 2.0.1 \(on OSX 10.12.6, with g++ 7.2.0\) [\#1506](https://github.com/kokkos/kokkos/issues/1506)
|
||||
- Build: dual\_view.view broken with UVM [\#1834](https://github.com/kokkos/kokkos/issues/1834)
|
||||
- Build: White cuda/9.2 + gcc/7.2 warnings triggering errors [\#1833](https://github.com/kokkos/kokkos/issues/1833)
|
||||
- Build: warning: enum constant in boolean context [\#1813](https://github.com/kokkos/kokkos/issues/1813)
|
||||
- Capability: Fix overly conservative max\_team\_size thingy [\#1808](https://github.com/kokkos/kokkos/issues/1808)
|
||||
- DynRankView: Ctors taking ViewAllocateWithoutInitializing broken [\#1783](https://github.com/kokkos/kokkos/issues/1783)
|
||||
- Cuda: Apollo cuda.team\_broadcast test fail with clang-6.0 [\#1762](https://github.com/kokkos/kokkos/issues/1762)
|
||||
- Cuda: Clang spurious test failure in impl\_view\_accessible [\#1753](https://github.com/kokkos/kokkos/issues/1753)
|
||||
- Cuda: Kokkos::complex\<double\> atomic deadlocks with Clang 6 Cuda build with -O0 [\#1752](https://github.com/kokkos/kokkos/issues/1752)
|
||||
- Cuda: LayoutStride Test fails for UVM as default memory space [\#1688](https://github.com/kokkos/kokkos/issues/1688)
|
||||
- Cuda: Scan wrong values on Volta [\#1676](https://github.com/kokkos/kokkos/issues/1676)
|
||||
- Cuda: Kokkos::deep\_copy error with CudaUVM and Kokkos::Serial spaces [\#1652](https://github.com/kokkos/kokkos/issues/1652)
|
||||
- Cuda: cudaErrorInvalidConfiguration with debug build [\#1647](https://github.com/kokkos/kokkos/issues/1647)
|
||||
- Cuda: parallel\_for with TeamPolicy::team\_size\_recommended with launch bounds not working -- reported by Daniel Holladay [\#1283](https://github.com/kokkos/kokkos/issues/1283)
|
||||
- Cuda: Using KOKKOS\_CLASS\_LAMBDA in a class with Kokkos::Random\_XorShift64\_Pool member data [\#1696](https://github.com/kokkos/kokkos/issues/1696)
|
||||
- Long Build Times on Darwin [\#1721](https://github.com/kokkos/kokkos/issues/1721)
|
||||
- Capability: Typo in Kokkos\_Sort.hpp - BinOp3D - wrong comparison [\#1720](https://github.com/kokkos/kokkos/issues/1720)
|
||||
- Buffer overflow in SharedAllocationRecord in Kokkos\_HostSpace.cpp [\#1673](https://github.com/kokkos/kokkos/issues/1673)
|
||||
- Serial unit test failure [\#1632](https://github.com/kokkos/kokkos/issues/1632)
|
||||
|
||||
## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00)
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
|
||||
|
||||
# Define Project Name if this is a standalone build
|
||||
IF(NOT DEFINED ${PROJECT_NAME})
|
||||
project(Kokkos CXX)
|
||||
project(Kokkos CXX)
|
||||
ENDIF()
|
||||
|
||||
# Basic initialization (Used in KOKKOS_SETTINGS)
|
||||
@ -22,7 +22,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
|
||||
include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
|
||||
set_kokkos_cxx_compiler()
|
||||
set_kokkos_cxx_standard()
|
||||
|
||||
|
||||
#------------ GET OPTIONS AND KOKKOS_SETTINGS --------------------------------
|
||||
# Add Kokkos' modules to CMake's module path.
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
|
||||
@ -34,7 +34,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
|
||||
|
||||
#------------ GENERATE HEADER AND SOURCE FILES -------------------------------
|
||||
execute_process(
|
||||
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings
|
||||
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} PREFIX=${CMAKE_INSTALL_PREFIX} generate_build_settings
|
||||
WORKING_DIRECTORY "${Kokkos_BINARY_DIR}"
|
||||
OUTPUT_FILE ${Kokkos_BINARY_DIR}/core_src_make.out
|
||||
RESULT_VARIABLE GEN_SETTINGS_RESULT
|
||||
@ -45,6 +45,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
|
||||
endif()
|
||||
include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake)
|
||||
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION lib/cmake/Kokkos)
|
||||
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION ${CMAKE_INSTALL_PREFIX})
|
||||
string(REPLACE " " ";" KOKKOS_TPL_INCLUDE_DIRS "${KOKKOS_GMAKE_TPL_INCLUDE_DIRS}")
|
||||
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_DIRS "${KOKKOS_GMAKE_TPL_LIBRARY_DIRS}")
|
||||
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_NAMES "${KOKKOS_GMAKE_TPL_LIBRARY_NAMES}")
|
||||
|
||||
@ -6,9 +6,9 @@ ifndef KOKKOS_PATH
|
||||
endif
|
||||
CXXFLAGS=$(CCFLAGS)
|
||||
|
||||
# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
|
||||
# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
#KOKKOS_DEVICES ?= "Pthread"
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
|
||||
@ -21,12 +21,13 @@ KOKKOS_ARCH ?= ""
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
# Options: hwloc,librt,experimental_memkind
|
||||
KOKKOS_USE_TPLS ?= ""
|
||||
# Options: c++11,c++1z
|
||||
# Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
|
||||
KOKKOS_CXX_STANDARD ?= "c++11"
|
||||
# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
|
||||
KOKKOS_OPTIONS ?= ""
|
||||
# Option for setting ETI path
|
||||
KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
|
||||
KOKKOS_CMAKE ?= "no"
|
||||
|
||||
# Default settings specific options.
|
||||
# Options: force_uvm,use_ldg,rdc,enable_lambda
|
||||
@ -41,7 +42,11 @@ kokkos_has_string=$(if $(findstring $2,$1),1,0)
|
||||
# Check for general settings.
|
||||
KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
|
||||
|
||||
# Check for external libraries.
|
||||
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
|
||||
@ -110,6 +115,18 @@ KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VE
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
|
||||
KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
|
||||
|
||||
# Check Host Compiler if using NVCC through nvcc_wrapper
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l))
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1)
|
||||
|
||||
KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1))
|
||||
KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),PGI)
|
||||
KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),Intel Corporation)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),clang)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG = 1
|
||||
endif
|
||||
@ -202,18 +219,34 @@ endif
|
||||
# Set C++11 flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := --c++11
|
||||
KOKKOS_INTERNAL_CXX14_FLAG := --c++14
|
||||
#KOKKOS_INTERNAL_CXX17_FLAG := --c++17
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
|
||||
#KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
|
||||
KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
|
||||
#KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
|
||||
#KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
|
||||
#KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
|
||||
KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14
|
||||
#KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y
|
||||
#KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17
|
||||
#KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z
|
||||
#KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG :=
|
||||
else
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
|
||||
KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14
|
||||
KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y
|
||||
KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17
|
||||
KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
|
||||
KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
@ -336,7 +369,9 @@ endif
|
||||
|
||||
#CPPFLAGS is now unused
|
||||
KOKKOS_CPPFLAGS =
|
||||
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
|
||||
endif
|
||||
KOKKOS_TPL_INCLUDE_DIRS =
|
||||
KOKKOS_TPL_LIBRARY_DIRS =
|
||||
KOKKOS_TPL_LIBRARY_NAMES =
|
||||
@ -347,9 +382,11 @@ endif
|
||||
|
||||
KOKKOS_LIBS = -ldl
|
||||
KOKKOS_TPL_LIBRARY_NAMES += dl
|
||||
KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
|
||||
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
|
||||
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
|
||||
endif
|
||||
KOKKOS_LINK_FLAGS =
|
||||
KOKKOS_SRC =
|
||||
KOKKOS_HEADERS =
|
||||
@ -377,10 +414,12 @@ tmp := $(call kokkos_append_header,"/* Execution Spaces */")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
|
||||
@ -438,11 +477,25 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX1Z")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX20")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
|
||||
@ -465,7 +518,9 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
|
||||
ifneq ($(HWLOC_PATH),)
|
||||
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
|
||||
endif
|
||||
KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
|
||||
KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib
|
||||
KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
|
||||
@ -484,7 +539,9 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
|
||||
ifneq ($(MEMKIND_PATH),)
|
||||
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
|
||||
endif
|
||||
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
|
||||
KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
|
||||
KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
|
||||
@ -977,7 +1034,9 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
endif
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
|
||||
ifneq ($(CUDA_PATH),)
|
||||
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
|
||||
endif
|
||||
KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
|
||||
KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
|
||||
KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
|
||||
@ -1032,7 +1091,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
|
||||
ifneq ($(QTHREADS_PATH),)
|
||||
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
|
||||
endif
|
||||
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
|
||||
KOKKOS_CXXLDFLAGS += -L$(QTHREADS_PATH)/lib
|
||||
KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
|
||||
|
||||
@ -52,44 +52,47 @@ For specifics see the LICENSE file contained in the repository or distribution.
|
||||
* GCC 4.8.4
|
||||
* GCC 4.9.3
|
||||
* GCC 5.1.0
|
||||
* GCC 5.3.0
|
||||
* GCC 5.5.0
|
||||
* GCC 6.1.0
|
||||
* GCC 7.2.0
|
||||
* GCC 7.3.0
|
||||
* GCC 8.1.0
|
||||
* Intel 15.0.2
|
||||
* Intel 16.0.1
|
||||
* Intel 17.1.043
|
||||
* Intel 17.0.1
|
||||
* Intel 17.4.196
|
||||
* Intel 18.0.128
|
||||
* Intel 18.2.128
|
||||
* Clang 3.6.1
|
||||
* Clang 3.7.1
|
||||
* Clang 3.8.1
|
||||
* Clang 3.9.0
|
||||
* Clang 4.0.0
|
||||
* Clang 4.0.0 for CUDA (CUDA Toolkit 8.0.44)
|
||||
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.1)
|
||||
* PGI 17.10
|
||||
* NVCC 7.0 for CUDA (with gcc 4.8.4)
|
||||
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
|
||||
* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
|
||||
* PGI 18.7
|
||||
* NVCC 7.5 for CUDA (with gcc 4.8.4)
|
||||
* NVCC 8.0.44 for CUDA (with gcc 5.3.0)
|
||||
* NVCC 9.1 for CUDA (with gcc 6.1.0)
|
||||
|
||||
### Primary tested compilers on Power 8 are:
|
||||
* GCC 5.4.0 (OpenMP,Serial)
|
||||
* IBM XL 13.1.6 (OpenMP, Serial)
|
||||
* NVCC 8.0.44 for CUDA (with gcc 5.4.0)
|
||||
* NVCC 9.0.103 for CUDA (with gcc 6.3.0 and XL 13.1.6)
|
||||
* GCC 6.4.0 (OpenMP,Serial)
|
||||
* GCC 7.2.0 (OpenMP,Serial)
|
||||
* IBM XL 16.1.0 (OpenMP, Serial)
|
||||
* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
|
||||
|
||||
### Primary tested compilers on Intel KNL are:
|
||||
* GCC 6.2.0
|
||||
* Intel 16.4.258 (with gcc 4.7.2)
|
||||
* Intel 17.2.174 (with gcc 4.9.3)
|
||||
* Intel 18.0.128 (with gcc 4.9.3)
|
||||
* Intel 18.2.199 (with gcc 4.9.3)
|
||||
|
||||
### Primary tested compilers on ARM
|
||||
* GCC 6.1.0
|
||||
### Primary tested compilers on ARM (Cavium ThunderX2)
|
||||
* GCC 7.2.0
|
||||
* ARM/Clang 18.4.0
|
||||
|
||||
### Other compilers working:
|
||||
* X86:
|
||||
- Cygwin 2.1.0 64bit with gcc 4.9.3
|
||||
- GCC 8.1.0 (not warning free)
|
||||
|
||||
### Known non-working combinations:
|
||||
* Power8:
|
||||
|
||||
@ -697,6 +697,7 @@ namespace Kokkos {
|
||||
typedef Random_XorShift64<DeviceType> generator_type;
|
||||
typedef DeviceType device_type;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64_Pool() {
|
||||
num_states_ = 0;
|
||||
}
|
||||
@ -709,12 +710,14 @@ namespace Kokkos {
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
|
||||
locks_(src.locks_),
|
||||
state_(src.state_),
|
||||
num_states_(src.num_states_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
|
||||
locks_ = src.locks_;
|
||||
state_ = src.state_;
|
||||
@ -958,6 +961,7 @@ namespace Kokkos {
|
||||
|
||||
typedef DeviceType device_type;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024_Pool() {
|
||||
num_states_ = 0;
|
||||
}
|
||||
@ -972,6 +976,7 @@ namespace Kokkos {
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
|
||||
locks_(src.locks_),
|
||||
state_(src.state_),
|
||||
@ -979,6 +984,7 @@ namespace Kokkos {
|
||||
num_states_(src.num_states_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
|
||||
locks_ = src.locks_;
|
||||
state_ = src.state_;
|
||||
|
||||
@ -246,8 +246,8 @@ public:
|
||||
{
|
||||
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
|
||||
bin_count_const = bin_count_atomic;
|
||||
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
|
||||
sort_order = offset_type("PermutationVector",range_end-range_begin);
|
||||
bin_offsets = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::bin_offsets"),bin_op.max_bins());
|
||||
sort_order = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sort_order"),range_end-range_begin);
|
||||
}
|
||||
|
||||
BinSort( const_key_view_type keys_
|
||||
@ -290,7 +290,7 @@ public:
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
scratch_view_type
|
||||
sorted_values("Scratch",
|
||||
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
|
||||
len,
|
||||
values.extent(1),
|
||||
values.extent(2),
|
||||
@ -301,7 +301,7 @@ public:
|
||||
values.extent(7));
|
||||
#else
|
||||
scratch_view_type
|
||||
sorted_values("Scratch",
|
||||
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
|
||||
values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
values.rank_dynamic > 1 ? values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
|
||||
values.rank_dynamic > 2 ? values.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
@ -483,7 +483,7 @@ struct BinOp3D {
|
||||
if (keys(i1,0)>keys(i2,0)) return true;
|
||||
else if (keys(i1,0)==keys(i2,0)) {
|
||||
if (keys(i1,1)>keys(i2,1)) return true;
|
||||
else if (keys(i1,1)==keys(i2,2)) {
|
||||
else if (keys(i1,1)==keys(i2,1)) {
|
||||
if (keys(i1,2)>keys(i2,2)) return true;
|
||||
}
|
||||
}
|
||||
|
||||
41
lib/kokkos/benchmarks/gups/Makefile
Normal file
@ -0,0 +1,41 @@
|
||||
# Build script for the Kokkos GUPS benchmark (gups-kokkos).
#
# Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo
KOKKOS_DEVICES = "Cuda"
KOKKOS_ARCH = "Pascal60"
KOKKOS_CUDA_OPTIONS = enable_lambda
#KOKKOS_DEVICES = "OpenMP"
#KOKKOS_ARCH = "Power8"

SRC = gups-kokkos.cc

# These targets do not name files they create; declaring them phony keeps a
# stray file called "build" or "clean" from silently disabling the rule.
.PHONY: default build clean

# "default" stays the first real target so that a bare `make` builds the
# benchmark; the echo runs once the "build" prerequisite has completed.
default: build
	echo "Start Build"

CXXFLAGS = -O3
# Derive the wrapper location from KOKKOS_PATH so that overriding KOKKOS_PATH
# is enough to point the whole build at another Kokkos checkout.
CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
#CXX = g++

LINK = ${CXX}

LINKFLAGS =
EXE = gups-kokkos

DEPFLAGS = -M

OBJ = $(SRC:.cc=.o)
LIB =

# Provides the KOKKOS_* variables used in the rules below.
include $(KOKKOS_PATH)/Makefile.kokkos

build: $(EXE)

$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)

clean: kokkos-clean
	rm -f *.o $(EXE)

# Compilation rules

%.o:%.cc $(KOKKOS_CPP_DEPENDS)
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
199
lib/kokkos/benchmarks/gups/gups-kokkos.cc
Normal file
@ -0,0 +1,199 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include "Kokkos_Core.hpp"
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <sys/time.h>
|
||||
|
||||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
|
||||
#else
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
|
||||
#endif
|
||||
|
||||
typedef int GUPSIndex;
|
||||
|
||||
// Returns the current time of day, as reported by gettimeofday(), expressed
// in seconds (whole seconds plus the microsecond part scaled by 1e-6).
double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);

  const double whole_seconds = (double) tv.tv_sec;
  const double micro_seconds = (double) tv.tv_usec;

  return whole_seconds + (micro_seconds * 1.0e-6);
}
|
||||
|
||||
// Overwrites every entry of the host array `indices` with lrand48() % dataCount
// and then copies the host array into `dev_indices`.
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
  GUPSIndex slot = 0;

  while( slot < indices.extent(0) ) {
    indices[slot] = lrand48() % dataCount;
    ++slot;
  }

  Kokkos::deep_copy(dev_indices, indices);
}
|
||||
|
||||
// Performs one GUPS sweep: for every entry of `indices`, XORs `datum` into
// data[indices[i]]. The update uses Kokkos::atomic_fetch_xor when
// `performAtomics` is true and a plain ^= otherwise. Calls Kokkos::fence()
// before returning.
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
              const bool performAtomics) {

  if( !performAtomics ) {
    Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex update) {
      data[indices[update]] ^= datum;
    });
  } else {
    Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex update) {
      Kokkos::atomic_fetch_xor( &data[indices[update]], datum );
    });
  }

  Kokkos::fence();
}
|
||||
|
||||
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
|
||||
const bool useAtomics) {
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
|
||||
1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
|
||||
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
|
||||
1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
|
||||
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No") );
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
GUPSDeviceArray dev_indices("indices", indicesCount);
|
||||
GUPSDeviceArray dev_data("data", dataCount);
|
||||
int64_t datum = -1;
|
||||
|
||||
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
|
||||
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
|
||||
|
||||
double gupsTime = 0.0;
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
|
||||
data[i] = 10101010101;
|
||||
});
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
|
||||
indices[i] = 0;
|
||||
});
|
||||
|
||||
Kokkos::deep_copy(dev_data, data);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
double start;
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
for( GUPSIndex k = 0; k < repeats; ++k ) {
|
||||
randomize_indices(indices, dev_indices, data.extent(0));
|
||||
|
||||
start = now();
|
||||
run_gups(dev_indices, dev_data, datum, useAtomics);
|
||||
gupsTime += now() - start;
|
||||
}
|
||||
|
||||
Kokkos::deep_copy(indices, dev_indices);
|
||||
Kokkos::deep_copy(data, dev_data);
|
||||
|
||||
printf(HLINE);
|
||||
printf("GUP/s Random: %18.6f\n",
|
||||
(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
|
||||
printf(HLINE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos GUPS Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
srand48(1010101);
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
|
||||
int64_t indices = 8192;
|
||||
int64_t data = 33554432;
|
||||
int64_t repeats = 10;
|
||||
bool useAtomics = false;
|
||||
|
||||
for( int i = 1; i < argc; ++i ) {
|
||||
if( strcmp( argv[i], "--indices" ) == 0 ) {
|
||||
indices = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--data" ) == 0 ) {
|
||||
data = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
|
||||
repeats = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
|
||||
useAtomics = true;
|
||||
}
|
||||
}
|
||||
|
||||
const int rc = run_benchmark(indices, data, repeats, useAtomics);
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
}
|
||||
41
lib/kokkos/benchmarks/stream/Makefile
Normal file
@ -0,0 +1,41 @@
|
||||
# Build file for the Kokkos STREAM benchmark (stream-kokkos).
#
# Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo

# Alternative CUDA configuration, kept for reference:
#KOKKOS_DEVICES = "Cuda"
#KOKKOS_ARCH = "Pascal60"
#KOKKOS_CUDA_OPTIONS = enable_lambda
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "Power8"

SRC = stream-kokkos.cc

# First rule in the file, hence the default goal: build, then report.
default: build
	echo "Start Build"

# None of these names corresponds to a file; declaring them phony keeps the
# rules working even if a file called "default", "build" or "clean" exists.
.PHONY: default build clean

CXXFLAGS = -O3
#CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper
CXX = g++

LINK = ${CXX}

LINKFLAGS =
EXE = stream-kokkos

DEPFLAGS = -M

OBJ = $(SRC:.cc=.o)
LIB =

# Supplies the KOKKOS_* variables and the kokkos-clean target used below.
include $(KOKKOS_PATH)/Makefile.kokkos

build: $(EXE)

$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)

clean: kokkos-clean
	rm -f *.o $(EXE)

# Compilation rules

%.o:%.cc $(KOKKOS_CPP_DEPENDS)
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
265
lib/kokkos/benchmarks/stream/stream-kokkos.cc
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include "Kokkos_Core.hpp"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
// Number of elements in each of the three benchmark arrays.
// May be overridden on the compiler command line (-DSTREAM_ARRAY_SIZE=...).
#ifndef STREAM_ARRAY_SIZE
#define STREAM_ARRAY_SIZE 100000000
#endif

// Number of times the copy/scale/add/triad sequence is repeated.
// May be overridden on the compiler command line (-DSTREAM_NTIMES=...).
#ifndef STREAM_NTIMES
#define STREAM_NTIMES 20
#endif
|
||||
|
||||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
|
||||
#else
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
|
||||
#endif
|
||||
|
||||
typedef int StreamIndex;
|
||||
|
||||
// Returns the current gettimeofday() reading as seconds, with the
// microsecond field folded in as a fraction.
double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);

  const double seconds      = (double) tv.tv_sec;
  const double microseconds = (double) tv.tv_usec;

  return seconds + (microseconds * 1.0e-6);
}
|
||||
|
||||
// STREAM "copy" kernel: c(i) = a(i) for every index in [0, a.extent(0)).
// b is not used by this kernel. Calls Kokkos::fence() before returning.
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
  const auto count = a.extent(0);

  Kokkos::parallel_for("copy", count, KOKKOS_LAMBDA(const StreamIndex idx) {
    c(idx) = a(idx);
  });

  Kokkos::fence();
}
|
||||
|
||||
// STREAM "scale" kernel: b(i) = scalar * c(i) for every index in
// [0, a.extent(0)). a only supplies the iteration count.
// Calls Kokkos::fence() before returning.
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
                   const double scalar) {

  // The kernel label was "copy" (copy-paste from perform_copy); use the
  // kernel's own name so it can be told apart from the copy kernel.
  Kokkos::parallel_for("scale", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
    b[i] = scalar * c[i];
  });

  Kokkos::fence();
}
|
||||
|
||||
// STREAM "add" kernel: c(i) = a(i) + b(i) for every index in
// [0, a.extent(0)). Calls Kokkos::fence() before returning.
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
  const auto count = a.extent(0);

  Kokkos::parallel_for("add", count, KOKKOS_LAMBDA(const StreamIndex idx) {
    c(idx) = a(idx) + b(idx);
  });

  Kokkos::fence();
}
|
||||
|
||||
// STREAM "triad" kernel: a(i) = b(i) + scalar * c(i) for every index in
// [0, a.extent(0)). Calls Kokkos::fence() before returning.
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
                   const double scalar) {
  const auto count = a.extent(0);

  Kokkos::parallel_for("triad", count, KOKKOS_LAMBDA(const StreamIndex idx) {
    a(idx) = b(idx) + scalar * c(idx);
  });

  Kokkos::fence();
}
|
||||
|
||||
// Checks the host copies of the three benchmark arrays against reference
// values and returns the number of arrays (0..3) that failed the check.
//
// a, b, c     host views holding the results copied back after the benchmark
// arraySize   number of elements to examine in each view
// scalar      the multiplier that was passed to the scale and triad kernels
//
// The reference values are produced by replaying the copy / scale / add /
// triad sequence on three scalars, starting from the initial element values
// (1.0, 2.0, 0.0) that run_benchmark writes into a, b and c. The sequence is
// replayed once per benchmark iteration (STREAM_NTIMES), matching the number
// of times run_benchmark executes the kernels. Replaying it arraySize times,
// as was done before, drives the reference values to infinity, which turns
// every relative error into NaN and makes every check pass unconditionally.
int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
                       const StreamIndex arraySize, const double scalar) {

  double ai = 1.0;
  double bi = 2.0;
  double ci = 0.0;

  for( int k = 0; k < STREAM_NTIMES; ++k ) {
    ci = ai;                // copy
    bi = scalar * ci;       // scale
    ci = ai + bi;           // add
    ai = bi + scalar * ci;  // triad
  }

  // Sum the absolute deviation over every element so that the averages
  // below reflect the whole array rather than only its last entry.
  double aError = 0.0;
  double bError = 0.0;
  double cError = 0.0;

  for( StreamIndex i = 0; i < arraySize; ++i ) {
    aError += std::abs( a[i] - ai );
    bError += std::abs( b[i] - bi );
    cError += std::abs( c[i] - ci );
  }

  const double aAvgError = aError / (double) arraySize;
  const double bAvgError = bError / (double) arraySize;
  const double cAvgError = cError / (double) arraySize;

  // Maximum tolerated average error, relative to the reference value.
  const double epsilon = 1.0e-13;
  int errorCount = 0;

  if( std::abs( aAvgError / ai ) > epsilon ) {
    fprintf(stderr, "Error: validation check on View a failed.\n");
    errorCount++;
  }

  if( std::abs( bAvgError / bi ) > epsilon ) {
    fprintf(stderr, "Error: validation check on View b failed.\n");
    errorCount++;
  }

  if( std::abs( cAvgError / ci ) > epsilon ) {
    fprintf(stderr, "Error: validation check on View c failed.\n");
    errorCount++;
  }

  if( errorCount == 0 ) {
    printf("All solutions checked and verified.\n");
  }

  return errorCount;
}
|
||||
|
||||
int run_benchmark() {
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Array Size: %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
|
||||
printf("- Per Array: %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
printf("- Total: %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
|
||||
|
||||
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
|
||||
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
|
||||
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
|
||||
|
||||
const double scalar = 3.0;
|
||||
|
||||
double copyTime = std::numeric_limits<double>::max();
|
||||
double scaleTime = std::numeric_limits<double>::max();
|
||||
double addTime = std::numeric_limits<double>::max();
|
||||
double triadTime = std::numeric_limits<double>::max();
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
|
||||
#else
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
|
||||
a[i] = 1.0;
|
||||
b[i] = 2.0;
|
||||
c[i] = 0.0;
|
||||
});
|
||||
|
||||
// Copy contents of a (from the host) to the dev_a (device)
|
||||
Kokkos::deep_copy(dev_a, a);
|
||||
Kokkos::deep_copy(dev_b, b);
|
||||
Kokkos::deep_copy(dev_c, c);
|
||||
|
||||
double start;
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
|
||||
start = now();
|
||||
perform_copy(dev_a, dev_b, dev_c);
|
||||
copyTime = std::min( copyTime, (now() - start) );
|
||||
|
||||
start = now();
|
||||
perform_scale(dev_a, dev_b, dev_c, scalar);
|
||||
scaleTime = std::min( scaleTime, (now() - start) );
|
||||
|
||||
start = now();
|
||||
perform_add(dev_a, dev_b, dev_c);
|
||||
addTime = std::min( addTime, (now() - start) );
|
||||
|
||||
start = now();
|
||||
perform_triad(dev_a, dev_b, dev_c, scalar);
|
||||
triadTime = std::min( triadTime, (now() - start) );
|
||||
}
|
||||
|
||||
Kokkos::deep_copy(a, dev_a);
|
||||
Kokkos::deep_copy(b, dev_b);
|
||||
Kokkos::deep_copy(c, dev_c);
|
||||
|
||||
printf("Performing validation...\n");
|
||||
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
printf("Copy %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
|
||||
printf("Scale %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
|
||||
printf("Add %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
|
||||
printf("Triad %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos STREAM Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
const int rc = run_benchmark();
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
}
|
||||
@ -125,18 +125,20 @@ function show_help {
|
||||
echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP"
|
||||
echo " Default: 1"
|
||||
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
|
||||
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
|
||||
echo " --force-openmp-num-threads=N"
|
||||
echo " --openmp-num-threads=N"
|
||||
echo " Override logic for selecting OMP_NUM_THREADS"
|
||||
echo " --force-openmp-proc-bind=<OP>"
|
||||
echo " --openmp-proc-bind=<OP>"
|
||||
echo " Override logic for selecting OMP_PROC_BIND"
|
||||
echo " --no-openmp-nested Set OMP_NESTED to false"
|
||||
echo " --openmp-nested Set OMP_NESTED to true"
|
||||
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
|
||||
echo " --output-prefix=<P> Save the output to files of the form"
|
||||
echo " P.hpcbind.N, P.stdout.N and P.stderr.N where P is "
|
||||
echo " the prefix and N is the rank (no spaces)"
|
||||
echo " --output-mode=<Op> How console output should be handled."
|
||||
echo " Options are all, rank0, and none. Default: rank0"
|
||||
echo " --lstopo Show bindings in lstopo"
|
||||
echo " --save-topology=<Xml> Save the topology to the given xml file"
|
||||
echo " --load-topology=<Xml> Load a previously saved topology from an xml file"
|
||||
echo " -v|--verbose Print bindings and relevant environment variables"
|
||||
echo " -h|--help Show this message"
|
||||
echo ""
|
||||
@ -189,7 +191,7 @@ HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
|
||||
declare -i HPCBIND_OPENMP_PROC_BIND=1
|
||||
HPCBIND_OPENMP_FORCE_NUM_THREADS=""
|
||||
HPCBIND_OPENMP_FORCE_PROC_BIND=""
|
||||
declare -i HPCBIND_OPENMP_NESTED=1
|
||||
declare -i HPCBIND_OPENMP_NESTED=0
|
||||
declare -i HPCBIND_VERBOSE=0
|
||||
|
||||
declare -i HPCBIND_LSTOPO=0
|
||||
@ -197,6 +199,9 @@ declare -i HPCBIND_LSTOPO=0
|
||||
HPCBIND_OUTPUT_PREFIX=""
|
||||
HPCBIND_OUTPUT_MODE="rank0"
|
||||
|
||||
HPCBIND_OUTPUT_TOPOLOGY=""
|
||||
HPCBIND_INPUT_TOPOLOGY=""
|
||||
|
||||
declare -i HPCBIND_HAS_COMMAND=0
|
||||
|
||||
for i in "$@"; do
|
||||
@ -276,10 +281,22 @@ for i in "$@"; do
|
||||
HPCBIND_OPENMP_NESTED=0
|
||||
shift
|
||||
;;
|
||||
--openmp-nested)
|
||||
HPCBIND_OPENMP_NESTED=1
|
||||
shift
|
||||
;;
|
||||
--output-prefix=*)
|
||||
HPCBIND_OUTPUT_PREFIX="${i#*=}"
|
||||
shift
|
||||
;;
|
||||
--save-topology=*)
|
||||
HPCBIND_OUTPUT_TOPOLOGY="${i#*=}"
|
||||
shift
|
||||
;;
|
||||
--load-topology=*)
|
||||
HPCBIND_INPUT_TOPOLOGY="${i#*=}"
|
||||
shift
|
||||
;;
|
||||
--output-mode=*)
|
||||
HPCBIND_OUTPUT_MODE="${i#*=}"
|
||||
#convert to lower case
|
||||
@ -327,24 +344,37 @@ elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
|
||||
HPCBIND_TEE=1
|
||||
fi
|
||||
|
||||
# Save the topology to the given xml file
|
||||
# When --save-topology was given, rank 0 writes the topology to the requested
# file; every other rank still runs lstopo-no-graphics but discards its output.
if [[ -n "${HPCBIND_OUTPUT_TOPOLOGY}" ]]; then
  if [[ ${HPCBIND_QUEUE_RANK} -ne 0 ]]; then
    lstopo-no-graphics >/dev/null 2>&1
  else
    lstopo-no-graphics "${HPCBIND_OUTPUT_TOPOLOGY}"
  fi
fi
|
||||
|
||||
# Load the topology from the given xml file
|
||||
# When --load-topology was given and the file exists, point hwloc at it via
# HWLOC_XMLFILE and set HWLOC_THISSYSTEM=1. A missing file is ignored.
if [[ "${HPCBIND_INPUT_TOPOLOGY}" != "" ]]; then
  # Quote the path so that a file name containing whitespace is tested as a
  # single argument instead of breaking the test expression.
  if [[ -f "${HPCBIND_INPUT_TOPOLOGY}" ]]; then
    export HWLOC_XMLFILE="${HPCBIND_INPUT_TOPOLOGY}"
    export HWLOC_THISSYSTEM=1
  fi
fi
|
||||
|
||||
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
|
||||
HPCBIND_LOG=/dev/null
|
||||
HPCBIND_ERR=/dev/null
|
||||
HPCBIND_OUT=/dev/null
|
||||
else
|
||||
if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then
|
||||
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
|
||||
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
|
||||
|
||||
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
|
||||
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
|
||||
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
|
||||
else
|
||||
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_RANK}"
|
||||
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_RANK}"
|
||||
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_RANK}"
|
||||
if [[ ${HPCBIND_QUEUE_SIZE} -le 0 ]]; then
|
||||
HPCBIND_QUEUE_SIZE=1
|
||||
fi
|
||||
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
|
||||
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
|
||||
|
||||
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
|
||||
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
|
||||
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
|
||||
> ${HPCBIND_LOG}
|
||||
fi
|
||||
|
||||
@ -546,6 +576,8 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
|
||||
hostname -s >> ${HPCBIND_LOG}
|
||||
echo "[HPCBIND]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
|
||||
echo "[HWLOC]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG}
|
||||
echo "[CUDA]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
|
||||
echo "[OPENMP]" >> ${HPCBIND_LOG}
|
||||
@ -568,6 +600,8 @@ else
|
||||
hostname -s > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[HWLOC]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
|
||||
|
||||
@ -74,6 +74,9 @@ dry_run=0
|
||||
host_only=0
|
||||
host_only_args=""
|
||||
|
||||
# Just run version on host compiler
|
||||
get_host_version=0
|
||||
|
||||
# Enable workaround for CUDA 6.5 for pragma ident
|
||||
replace_pragma_ident=0
|
||||
|
||||
@ -93,6 +96,9 @@ depfile_separate=0
|
||||
depfile_output_arg=""
|
||||
depfile_target_arg=""
|
||||
|
||||
# Option to remove duplicate libraries and object files
|
||||
remove_duplicate_link_files=0
|
||||
|
||||
#echo "Arguments: $# $@"
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
@ -106,10 +112,18 @@ do
|
||||
--host-only)
|
||||
host_only=1
|
||||
;;
|
||||
#get the host version only
|
||||
--host-version)
|
||||
get_host_version=1
|
||||
;;
|
||||
#replace '#pragma ident' with '#ident' this is needed to compile OpenMPI due to a configure script bug and a non standardized behaviour of pragma with macros
|
||||
--replace-pragma-ident)
|
||||
replace_pragma_ident=1
|
||||
;;
|
||||
#remove duplicate link files
|
||||
--remove-duplicate-link-files)
|
||||
remove_duplicate_link_files=1
|
||||
;;
|
||||
#handle source files to be compiled as cuda files
|
||||
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
|
||||
cpp_files="$cpp_files $1"
|
||||
@ -124,7 +138,12 @@ do
|
||||
fi
|
||||
;;
|
||||
#Handle shared args (valid for both nvcc and the host compiler)
|
||||
-D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
|
||||
-D*)
|
||||
unescape_commas=`echo "$1" | sed -e 's/\\\,/,/g'`
|
||||
arg=`printf "%q" $unescape_commas`
|
||||
shared_args="$shared_args $arg"
|
||||
;;
|
||||
-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared|-w)
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#Handle compilation argument
|
||||
@ -152,7 +171,7 @@ do
|
||||
shift
|
||||
;;
|
||||
#Handle known nvcc args
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
|
||||
--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle more known nvcc args
|
||||
@ -164,8 +183,11 @@ do
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
-rdc=*|-maxrregcount*|--maxrregcount*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle c++11
|
||||
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
|
||||
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1y|-std=c++1y|--std=c++17|-std=c++17|--std=c++1z|-std=c++1z)
|
||||
if [ $stdcxx_applied -eq 1 ]; then
|
||||
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
|
||||
else
|
||||
@ -205,6 +227,15 @@ do
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
#Handle -+ (same as -x c++, specifically used for xl compilers, but mutually exclusive with -x. So replace it with -x c++)
|
||||
-+)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="-x,c++"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,-x,c++"
|
||||
fi
|
||||
;;
|
||||
#Handle -ccbin (if its not set we can set it to a default value)
|
||||
-ccbin)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
@ -212,18 +243,39 @@ do
|
||||
host_compiler=$2
|
||||
shift
|
||||
;;
|
||||
#Handle -arch argument (if its not set use a default
|
||||
-arch*)
|
||||
|
||||
#Handle -arch argument (if its not set use a default) this is the version with = sign
|
||||
-arch*|-gencode*)
|
||||
cuda_args="$cuda_args $1"
|
||||
arch_set=1
|
||||
;;
|
||||
#Handle -code argument (if its not set use a default) this is the version with = sign
|
||||
-code*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle -arch argument (if its not set use a default) this is the version without = sign
|
||||
-arch|-gencode)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
arch_set=1
|
||||
shift
|
||||
;;
|
||||
#Handle -code argument (if its not set use a default) this is the version without = sign
|
||||
-code)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle -Xcudafe argument
|
||||
-Xcudafe)
|
||||
cuda_args="$cuda_args -Xcudafe $2"
|
||||
shift
|
||||
;;
|
||||
#Handle -Xlinker argument
|
||||
-Xlinker)
|
||||
xlinker_args="$xlinker_args -Xlinker $2"
|
||||
shift
|
||||
;;
|
||||
#Handle args that should be sent to the linker
|
||||
-Wl*)
|
||||
-Wl,*)
|
||||
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
|
||||
host_linker_args="$host_linker_args ${1:4:${#1}}"
|
||||
;;
|
||||
@ -256,6 +308,44 @@ do
|
||||
shift
|
||||
done
|
||||
|
||||
# Only print host compiler version
|
||||
if [ $get_host_version -eq 1 ]; then
|
||||
$host_compiler --version
|
||||
exit
|
||||
fi
|
||||
|
||||
#Remove duplicate object files
|
||||
if [ $remove_duplicate_link_files -eq 1 ]; then
|
||||
for obj in $object_files
|
||||
do
|
||||
object_files_reverse="$obj $object_files_reverse"
|
||||
done
|
||||
|
||||
object_files_reverse_clean=""
|
||||
for obj in $object_files_reverse
|
||||
do
|
||||
exists=false
|
||||
for obj2 in $object_files_reverse_clean
|
||||
do
|
||||
if [ "$obj" == "$obj2" ]
|
||||
then
|
||||
exists=true
|
||||
echo "Exists: $obj"
|
||||
fi
|
||||
done
|
||||
if [ "$exists" == "false" ]
|
||||
then
|
||||
object_files_reverse_clean="$object_files_reverse_clean $obj"
|
||||
fi
|
||||
done
|
||||
|
||||
object_files=""
|
||||
for obj in $object_files_reverse_clean
|
||||
do
|
||||
object_files="$obj $object_files"
|
||||
done
|
||||
fi
|
||||
|
||||
#Add default host compiler if necessary
|
||||
if [ $ccbin_set -ne 1 ]; then
|
||||
cuda_args="$cuda_args -ccbin $host_compiler"
|
||||
@ -328,10 +418,19 @@ fi
|
||||
|
||||
#Run compilation command
|
||||
if [ $host_only -eq 1 ]; then
|
||||
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
|
||||
echo "$host_command"
|
||||
fi
|
||||
$host_command
|
||||
elif [ -n "$nvcc_depfile_command" ]; then
|
||||
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
|
||||
echo "$nvcc_command && $nvcc_depfile_command"
|
||||
fi
|
||||
$nvcc_command && $nvcc_depfile_command
|
||||
else
|
||||
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
|
||||
echo "$nvcc_command"
|
||||
fi
|
||||
$nvcc_command
|
||||
fi
|
||||
error_code=$?
|
||||
|
||||
@ -235,3 +235,7 @@ install(FILES
|
||||
# Install the export set for use with the install-tree
|
||||
INSTALL(EXPORT KokkosTargets DESTINATION
|
||||
"${INSTALL_CMAKE_DIR}")
|
||||
|
||||
# build and install pkgconfig file
|
||||
CONFIGURE_FILE(core/src/kokkos.pc.in kokkos.pc @ONLY)
|
||||
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)
|
||||
|
||||
@ -47,7 +47,7 @@ function(set_kokkos_cxx_compiler)
|
||||
OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+$"
|
||||
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||
INTERNAL_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
|
||||
endif()
|
||||
|
||||
|
||||
@ -41,7 +41,6 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
|
||||
foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
|
||||
string(TOUPPER ${opt} OPT )
|
||||
IF(DEFINED Kokkos_ENABLE_${opt})
|
||||
MESSAGE("Kokkos_ENABLE_${opt} is defined!")
|
||||
IF(DEFINED KOKKOS_ENABLE_${OPT})
|
||||
IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
|
||||
IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL)
|
||||
@ -59,7 +58,6 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
|
||||
ENDIF()
|
||||
ELSE()
|
||||
SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
|
||||
MESSAGE("set KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT!")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
endforeach()
|
||||
@ -81,6 +79,7 @@ list(APPEND KOKKOS_ARCH_LIST
|
||||
ARMv80 # (HOST) ARMv8.0 Compatible CPU
|
||||
ARMv81 # (HOST) ARMv8.1 Compatible CPU
|
||||
ARMv8-ThunderX # (HOST) ARMv8 Cavium ThunderX CPU
|
||||
ARMv8-TX2 # (HOST) ARMv8 Cavium ThunderX2 CPU
|
||||
WSM # (HOST) Intel Westmere CPU
|
||||
SNB # (HOST) Intel Sandy/Ivy Bridge CPUs
|
||||
HSW # (HOST) Intel Haswell CPUs
|
||||
@ -123,11 +122,18 @@ list(APPEND KOKKOS_DEVICES_LIST
|
||||
# List of possible TPLs for Kokkos
|
||||
# From Makefile.kokkos: Options: hwloc,librt,experimental_memkind
|
||||
set(KOKKOS_USE_TPLS_LIST)
|
||||
if(APPLE)
|
||||
list(APPEND KOKKOS_USE_TPLS_LIST
|
||||
HWLOC # hwloc
|
||||
MEMKIND # experimental_memkind
|
||||
)
|
||||
else()
|
||||
list(APPEND KOKKOS_USE_TPLS_LIST
|
||||
HWLOC # hwloc
|
||||
LIBRT # librt
|
||||
MEMKIND # experimental_memkind
|
||||
)
|
||||
endif()
|
||||
# Map of cmake variables to Makefile variables
|
||||
set(KOKKOS_INTERNAL_HWLOC hwloc)
|
||||
set(KOKKOS_INTERNAL_LIBRT librt)
|
||||
@ -172,6 +178,7 @@ set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
|
||||
|
||||
set(tmpr "\n ")
|
||||
string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_ARCH_DOCSTR "${KOKKOS_ARCH_LIST}")
|
||||
set(KOKKOS_INTERNAL_ARCH_DOCSTR "${tmpr}${KOKKOS_INTERNAL_ARCH_DOCSTR}")
|
||||
# This would be useful, but we use Foo_ENABLE mechanisms
|
||||
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_DEVICES_DOCSTR "${KOKKOS_DEVICES_LIST}")
|
||||
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_USE_TPLS_DOCSTR "${KOKKOS_USE_TPLS_LIST}")
|
||||
@ -269,7 +276,7 @@ set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_P
|
||||
set_kokkos_default_default(DEPRECATED_CODE ON)
|
||||
set(KOKKOS_ENABLE_DEPRECATED_CODE ${KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE_DEFAULT} CACHE BOOL "Enable deprecated code.")
|
||||
|
||||
set_kokkos_default_default(EXPLICIT_INSTANTIATION ON)
|
||||
set_kokkos_default_default(EXPLICIT_INSTANTIATION OFF)
|
||||
set(KOKKOS_ENABLE_EXPLICIT_INSTANTIATION ${KOKKOS_INTERNAL_ENABLE_EXPLICIT_INSTANTIATION_DEFAULT} CACHE BOOL "Enable explicit template instantiation.")
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
@ -15,16 +15,16 @@
|
||||
|
||||
# Ensure that KOKKOS_ARCH is in the ARCH_LIST
|
||||
if (KOKKOS_ARCH MATCHES ",")
|
||||
message("-- Detected a comma in: KOKKOS_ARCH=${KOKKOS_ARCH}")
|
||||
message("-- Detected a comma in: KOKKOS_ARCH=`${KOKKOS_ARCH}`")
|
||||
message("-- Although we prefer KOKKOS_ARCH to be semicolon-delimited, we do allow")
|
||||
message("-- comma-delimited values for compatibility with scripts (see github.com/trilinos/Trilinos/issues/2330)")
|
||||
string(REPLACE "," ";" KOKKOS_ARCH "${KOKKOS_ARCH}")
|
||||
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=${KOKKOS_ARCH}")
|
||||
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=`${KOKKOS_ARCH}`")
|
||||
endif()
|
||||
foreach(arch ${KOKKOS_ARCH})
|
||||
list(FIND KOKKOS_ARCH_LIST ${arch} indx)
|
||||
if (indx EQUAL -1)
|
||||
message(FATAL_ERROR "${arch} is not an accepted value for KOKKOS_ARCH."
|
||||
message(FATAL_ERROR "`${arch}` is not an accepted value in KOKKOS_ARCH=`${KOKKOS_ARCH}`."
|
||||
" Please pick from these choices: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
|
||||
endif ()
|
||||
endforeach()
|
||||
@ -130,7 +130,8 @@ string(REPLACE ";" ":" KOKKOS_INTERNAL_ADDTOPATH "${addpathl}")
|
||||
# Set the KOKKOS_SETTINGS String -- this is the primary communication with the
|
||||
# makefile configuration. See Makefile.kokkos
|
||||
|
||||
set(KOKKOS_SETTINGS KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
|
||||
set(KOKKOS_SETTINGS KOKKOS_CMAKE=yes)
|
||||
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
|
||||
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_PATH=${KOKKOS_PATH})
|
||||
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_INSTALL_PATH=${CMAKE_INSTALL_PREFIX})
|
||||
|
||||
|
||||
@ -241,17 +241,16 @@ elif [ "$MACHINE" = "white" ]; then
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
|
||||
CUDA_MODULE_LIST2="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.3.0,ibm/xl/13.1.6"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.4.0,ibm/xl/16.1.0"
|
||||
|
||||
# Don't do pthread on white.
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/13.1.6 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/9.0.103 $CUDA_MODULE_LIST2 $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
@ -362,7 +361,7 @@ elif [ "$MACHINE" = "apollo" ]; then
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS"
|
||||
"cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
|
||||
@ -96,6 +96,7 @@ template< class DataType ,
|
||||
class Arg3Type = void>
|
||||
class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
|
||||
{
|
||||
template< class , class , class , class > friend class DualView ;
|
||||
public:
|
||||
//! \name Typedefs for device types and various Kokkos::View specializations.
|
||||
//@{
|
||||
@ -182,8 +183,20 @@ public:
|
||||
//! \name Counters to keep track of changes ("modified" flags)
|
||||
//@{
|
||||
|
||||
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
|
||||
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
protected:
|
||||
// modified_flags[0] -> host
|
||||
// modified_flags[1] -> device
|
||||
typedef View<unsigned int[2],LayoutLeft,Kokkos::HostSpace> t_modified_flags;
|
||||
t_modified_flags modified_flags;
|
||||
|
||||
public:
|
||||
#else
|
||||
typedef View<unsigned int[2],LayoutLeft,typename t_host::execution_space> t_modified_flags;
|
||||
typedef View<unsigned int,LayoutLeft,typename t_host::execution_space> t_modified_flag;
|
||||
t_modified_flags modified_flags;
|
||||
t_modified_flag modified_host,modified_device;
|
||||
#endif
|
||||
|
||||
//@}
|
||||
//! \name Constructors
|
||||
@ -194,10 +207,14 @@ public:
|
||||
/// Both device and host View objects are constructed using their
|
||||
/// default constructors. The "modified" flags are both initialized
|
||||
/// to "unmodified."
|
||||
DualView () :
|
||||
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
|
||||
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
|
||||
{}
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
DualView () = default;
|
||||
#else
|
||||
DualView ():modified_flags (t_modified_flags("DualView::modified_flags")) {
|
||||
modified_host = t_modified_flag(modified_flags,0);
|
||||
modified_device = t_modified_flag(modified_flags,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
/// \brief Constructor that allocates View objects on both host and device.
|
||||
///
|
||||
@ -219,17 +236,24 @@ public:
|
||||
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
|
||||
: d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
|
||||
, h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
|
||||
, modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
|
||||
, modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
|
||||
{}
|
||||
, modified_flags (t_modified_flags("DualView::modified_flags"))
|
||||
{
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
modified_host = t_modified_flag(modified_flags,0);
|
||||
modified_device = t_modified_flag(modified_flags,1);
|
||||
#endif
|
||||
}
|
||||
|
||||
//! Copy constructor (shallow copy)
|
||||
template<class SS, class LS, class DS, class MS>
|
||||
DualView (const DualView<SS,LS,DS,MS>& src) :
|
||||
d_view (src.d_view),
|
||||
h_view (src.h_view),
|
||||
modified_device (src.modified_device),
|
||||
modified_host (src.modified_host)
|
||||
modified_flags (src.modified_flags)
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
, modified_host(src.modified_host)
|
||||
, modified_device(src.modified_device)
|
||||
#endif
|
||||
{}
|
||||
|
||||
//! Subview constructor
|
||||
@ -241,8 +265,11 @@ public:
|
||||
)
|
||||
: d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
|
||||
, h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
|
||||
, modified_device (src.modified_device)
|
||||
, modified_host (src.modified_host)
|
||||
, modified_flags (src.modified_flags)
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
, modified_host(src.modified_host)
|
||||
, modified_device(src.modified_device)
|
||||
#endif
|
||||
{}
|
||||
|
||||
/// \brief Create DualView from existing device and host View objects.
|
||||
@ -258,8 +285,7 @@ public:
|
||||
DualView (const t_dev& d_view_, const t_host& h_view_) :
|
||||
d_view (d_view_),
|
||||
h_view (h_view_),
|
||||
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
|
||||
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
|
||||
modified_flags (t_modified_flags("DualView::modified_flags"))
|
||||
{
|
||||
if ( int(d_view.rank) != int(h_view.rank) ||
|
||||
d_view.extent(0) != h_view.extent(0) ||
|
||||
@ -281,6 +307,10 @@ public:
|
||||
d_view.span() != h_view.span() ) {
|
||||
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
|
||||
}
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
modified_host = t_modified_flag(modified_flags,0);
|
||||
modified_device = t_modified_flag(modified_flags,1);
|
||||
#endif
|
||||
}
|
||||
|
||||
//@}
|
||||
@ -316,6 +346,30 @@ public:
|
||||
t_dev,
|
||||
t_host>::type& view () const
|
||||
{
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
|
||||
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
|
||||
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
|
||||
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
|
||||
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
|
||||
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
|
||||
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
|
||||
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
|
||||
|
||||
static_assert(
|
||||
device_is_t_dev_device || device_is_t_host_device ||
|
||||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
|
||||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
|
||||
(
|
||||
(!device_is_execspace && !device_is_memspace) && (
|
||||
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
|
||||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
|
||||
)
|
||||
)
|
||||
,
|
||||
"Template parameter to .view() must exactly match one of the DualView's device types or one of the execution or memory spaces");
|
||||
#endif
|
||||
|
||||
return Impl::if_c<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
@ -324,6 +378,72 @@ public:
|
||||
t_host >::select (d_view , h_view);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
t_host view_host() const {
|
||||
return h_view;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
t_dev view_device() const {
|
||||
return d_view;
|
||||
}
|
||||
|
||||
template<class Device>
|
||||
static int get_device_side() {
|
||||
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
|
||||
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
|
||||
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
|
||||
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
|
||||
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
|
||||
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
|
||||
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
|
||||
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
static_assert(
|
||||
device_is_t_dev_device || device_is_t_host_device ||
|
||||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
|
||||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
|
||||
(
|
||||
(!device_is_execspace && !device_is_memspace) && (
|
||||
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
|
||||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
|
||||
)
|
||||
)
|
||||
,
|
||||
"Template parameter to .sync() must exactly match one of the DualView's device types or one of the execution or memory spaces");
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
int dev = -1;
|
||||
#else
|
||||
int dev = 0;
|
||||
#endif
|
||||
if(device_is_t_dev_device) dev = 1;
|
||||
else if(device_is_t_host_device) dev = 0;
|
||||
else {
|
||||
if(device_is_memspace) {
|
||||
if(device_mem_is_t_dev_mem) dev = 1;
|
||||
if(device_mem_is_t_host_mem) dev = 0;
|
||||
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
|
||||
}
|
||||
if(device_is_execspace) {
|
||||
if(device_exec_is_t_dev_exec) dev = 1;
|
||||
if(device_exec_is_t_host_exec) dev = 0;
|
||||
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
|
||||
}
|
||||
if(!device_is_execspace && !device_is_memspace) {
|
||||
if(device_mem_is_t_dev_mem) dev = 1;
|
||||
if(device_mem_is_t_host_mem) dev = 0;
|
||||
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
|
||||
if(device_exec_is_t_dev_exec) dev = 1;
|
||||
if(device_exec_is_t_host_exec) dev = 0;
|
||||
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
|
||||
}
|
||||
}
|
||||
return dev;
|
||||
}
|
||||
|
||||
/// \brief Update data on device or host only if data in the other
|
||||
/// space has been marked as modified.
|
||||
///
|
||||
@ -347,23 +467,20 @@ public:
|
||||
( std::is_same< Device , int>::value)
|
||||
, int >::type& = 0)
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value ,
|
||||
unsigned int,
|
||||
unsigned int>::select (1, 0);
|
||||
if(modified_flags.data()==NULL) return;
|
||||
|
||||
if (dev) { // if Device is the same as DualView's device type
|
||||
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
|
||||
int dev = get_device_side<Device>();
|
||||
|
||||
if (dev == 1) { // if Device is the same as DualView's device type
|
||||
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
|
||||
deep_copy (d_view, h_view);
|
||||
modified_host() = modified_device() = 0;
|
||||
modified_flags(0) = modified_flags(1) = 0;
|
||||
}
|
||||
} else { // hopefully Device is the same as DualView's host type
|
||||
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
|
||||
}
|
||||
if (dev == 0) { // hopefully Device is the same as DualView's host type
|
||||
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
|
||||
deep_copy (h_view, d_view);
|
||||
modified_host() = modified_device() = 0;
|
||||
modified_flags(0) = modified_flags(1) = 0;
|
||||
}
|
||||
}
|
||||
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
|
||||
@ -378,46 +495,71 @@ public:
|
||||
( std::is_same< Device , int>::value)
|
||||
, int >::type& = 0 )
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
unsigned int,
|
||||
unsigned int>::select (1, 0);
|
||||
if (dev) { // if Device is the same as DualView's device type
|
||||
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
|
||||
if(modified_flags.data()==NULL) return;
|
||||
|
||||
int dev = get_device_side<Device>();
|
||||
|
||||
if (dev == 1) { // if Device is the same as DualView's device type
|
||||
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
|
||||
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
|
||||
}
|
||||
} else { // hopefully Device is the same as DualView's host type
|
||||
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
|
||||
}
|
||||
if (dev == 0){ // hopefully Device is the same as DualView's host type
|
||||
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
|
||||
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void sync_host() {
|
||||
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
|
||||
Impl::throw_runtime_exception("Calling sync_host on a DualView with a const datatype.");
|
||||
if(modified_flags.data()==NULL) return;
|
||||
if(modified_flags(1) > modified_flags(0)) {
|
||||
deep_copy (h_view, d_view);
|
||||
modified_flags(1) = modified_flags(0) = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void sync_device() {
|
||||
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
|
||||
Impl::throw_runtime_exception("Calling sync_device on a DualView with a const datatype.");
|
||||
if(modified_flags.data()==NULL) return;
|
||||
if(modified_flags(0) > modified_flags(1)) {
|
||||
deep_copy (d_view, h_view);
|
||||
modified_flags(1) = modified_flags(0) = 0;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Device>
|
||||
bool need_sync() const
|
||||
{
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value ,
|
||||
unsigned int,
|
||||
unsigned int>::select (1, 0);
|
||||
if(modified_flags.data()==NULL) return false;
|
||||
int dev = get_device_side<Device>();
|
||||
|
||||
if (dev) { // if Device is the same as DualView's device type
|
||||
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
|
||||
if (dev == 1) { // if Device is the same as DualView's device type
|
||||
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
|
||||
return true;
|
||||
}
|
||||
} else { // hopefully Device is the same as DualView's host type
|
||||
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
|
||||
}
|
||||
if (dev == 0){ // hopefully Device is the same as DualView's host type
|
||||
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool need_sync_host() const {
|
||||
if(modified_flags.data()==NULL) return false;
|
||||
return modified_flags(0)<modified_flags(1);
|
||||
}
|
||||
|
||||
inline bool need_sync_device() const {
|
||||
if(modified_flags.data()==NULL) return false;
|
||||
return modified_flags(1)<modified_flags(0);
|
||||
}
|
||||
|
||||
/// \brief Mark data as modified on the given device \c Device.
|
||||
///
|
||||
/// If \c Device is the same as this DualView's device type, then
|
||||
@ -425,26 +567,22 @@ public:
|
||||
/// data as modified.
|
||||
template<class Device>
|
||||
void modify () {
|
||||
const unsigned int dev =
|
||||
Impl::if_c<
|
||||
std::is_same<
|
||||
typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
unsigned int,
|
||||
unsigned int>::select (1, 0);
|
||||
if(modified_flags.data()==NULL) return;
|
||||
int dev = get_device_side<Device>();
|
||||
|
||||
if (dev) { // if Device is the same as DualView's device type
|
||||
if (dev == 1) { // if Device is the same as DualView's device type
|
||||
// Increment the device's modified count.
|
||||
modified_device () = (modified_device () > modified_host () ?
|
||||
modified_device () : modified_host ()) + 1;
|
||||
} else { // hopefully Device is the same as DualView's host type
|
||||
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
|
||||
modified_flags(1) : modified_flags(0)) + 1;
|
||||
}
|
||||
if (dev == 0) { // hopefully Device is the same as DualView's host type
|
||||
// Increment the host's modified count.
|
||||
modified_host () = (modified_device () > modified_host () ?
|
||||
modified_device () : modified_host ()) + 1;
|
||||
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
|
||||
modified_flags(1) : modified_flags(0)) + 1;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
|
||||
if (modified_host() && modified_device()) {
|
||||
if (modified_flags(0) && modified_flags(1)) {
|
||||
std::string msg = "Kokkos::DualView::modify ERROR: ";
|
||||
msg += "Concurrent modification of host and device views ";
|
||||
msg += "in DualView \"";
|
||||
@ -455,6 +593,45 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void modify_host() {
|
||||
if(modified_flags.data()!=NULL) {
|
||||
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
|
||||
modified_flags(1) : modified_flags(0)) + 1;
|
||||
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
|
||||
if (modified_flags(0) && modified_flags(1)) {
|
||||
std::string msg = "Kokkos::DualView::modify_host ERROR: ";
|
||||
msg += "Concurrent modification of host and device views ";
|
||||
msg += "in DualView \"";
|
||||
msg += d_view.label();
|
||||
msg += "\"\n";
|
||||
Kokkos::abort(msg.c_str());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
inline void modify_device() {
|
||||
if(modified_flags.data()!=NULL) {
|
||||
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
|
||||
modified_flags(1) : modified_flags(0)) + 1;
|
||||
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
|
||||
if (modified_flags(0) && modified_flags(1)) {
|
||||
std::string msg = "Kokkos::DualView::modify_device ERROR: ";
|
||||
msg += "Concurrent modification of host and device views ";
|
||||
msg += "in DualView \"";
|
||||
msg += d_view.label();
|
||||
msg += "\"\n";
|
||||
Kokkos::abort(msg.c_str());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
inline void clear_sync_state() {
|
||||
if(modified_flags.data()!=NULL)
|
||||
modified_flags(1) = modified_flags(0) = 0;
|
||||
}
|
||||
|
||||
//@}
|
||||
//! \name Methods for reallocating or resizing the View objects.
|
||||
//@{
|
||||
@ -476,7 +653,10 @@ public:
|
||||
h_view = create_mirror_view( d_view );
|
||||
|
||||
/* Reset dirty flags */
|
||||
modified_device() = modified_host() = 0;
|
||||
if(modified_flags.data()==NULL) {
|
||||
modified_flags = t_modified_flags("DualView::modified_flags");
|
||||
} else
|
||||
modified_flags(1) = modified_flags(0) = 0;
|
||||
}
|
||||
|
||||
/// \brief Resize both views, copying old contents into new if necessary.
|
||||
@ -491,13 +671,16 @@ public:
|
||||
const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
|
||||
const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
|
||||
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ) {
|
||||
if(modified_device() >= modified_host()) {
|
||||
if(modified_flags.data()==NULL) {
|
||||
modified_flags = t_modified_flags("DualView::modified_flags");
|
||||
}
|
||||
if(modified_flags(1) >= modified_flags(0)) {
|
||||
/* Resize on Device */
|
||||
::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
|
||||
h_view = create_mirror_view( d_view );
|
||||
|
||||
/* Mark Device copy as modified */
|
||||
modified_device() = modified_device()+1;
|
||||
modified_flags(1) = modified_flags(1)+1;
|
||||
|
||||
} else {
|
||||
/* Realloc on Device */
|
||||
@ -525,7 +708,7 @@ public:
|
||||
d_view = create_mirror_view( typename t_dev::execution_space(), h_view );
|
||||
|
||||
/* Mark Host copy as modified */
|
||||
modified_host() = modified_host()+1;
|
||||
modified_flags(0) = modified_flags(0)+1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -649,7 +832,10 @@ void
|
||||
deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
|
||||
const DualView<ST,SL,SD,SM>& src )
|
||||
{
|
||||
if (src.modified_device () >= src.modified_host ()) {
|
||||
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
|
||||
return deep_copy(dst.d_view, src.d_view);
|
||||
}
|
||||
if (src.modified_flags(1) >= src.modified_flags(0)) {
|
||||
deep_copy (dst.d_view, src.d_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
|
||||
} else {
|
||||
@ -666,7 +852,10 @@ deep_copy (const ExecutionSpace& exec ,
|
||||
DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
|
||||
const DualView<ST,SL,SD,SM>& src )
|
||||
{
|
||||
if (src.modified_device () >= src.modified_host ()) {
|
||||
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
|
||||
return deep_copy(exec, dst.d_view, src.d_view);
|
||||
}
|
||||
if (src.modified_flags(1) >= src.modified_flags(0)) {
|
||||
deep_copy (exec, dst.d_view, src.d_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
|
||||
} else {
|
||||
|
||||
@ -64,7 +64,7 @@ namespace Impl {
|
||||
template <typename Specialize>
|
||||
struct DynRankDimTraits {
|
||||
|
||||
enum : size_t{unspecified =KOKKOS_INVALID_INDEX};
|
||||
enum : size_t{unspecified = KOKKOS_INVALID_INDEX};
|
||||
|
||||
// Compute the rank of the view from the nonzero dimension arguments.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -384,8 +384,8 @@ public:
|
||||
// Removed dimension checks...
|
||||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_map.m_impl_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_impl_handle , src.m_track );
|
||||
dst.m_track.assign( src.m_track , DstTraits::is_managed );
|
||||
dst.m_rank = src.Rank ;
|
||||
}
|
||||
@ -565,10 +565,14 @@ public:
|
||||
|
||||
//----------------------------------------
|
||||
// Allow specializations to query their specialized map
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
implementation_map() const { return m_map ; }
|
||||
#endif
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
impl_map() const { return m_map ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
@ -624,7 +628,7 @@ public:
|
||||
reference_type operator()() const
|
||||
{
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
|
||||
return implementation_map().reference();
|
||||
return impl_map().reference();
|
||||
//return m_map.reference(0,0,0,0,0,0,0);
|
||||
}
|
||||
|
||||
@ -647,7 +651,7 @@ public:
|
||||
typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
|
||||
operator[](const iType & i0) const
|
||||
{
|
||||
// auto map = implementation_map();
|
||||
// auto map = impl_map();
|
||||
const size_t dim_scalar = m_map.dimension_scalar();
|
||||
const size_t bytes = this->span() / dim_scalar;
|
||||
|
||||
@ -785,7 +789,7 @@ public:
|
||||
reference_type access() const
|
||||
{
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
|
||||
return implementation_map().reference();
|
||||
return impl_map().reference();
|
||||
//return m_map.reference(0,0,0,0,0,0,0);
|
||||
}
|
||||
|
||||
@ -1004,7 +1008,7 @@ public:
|
||||
|
||||
//----------------------------------------
|
||||
// Allocation according to allocation properties and array layout
|
||||
// unused arg_layout dimensions must be set toKOKKOS_INVALID_INDEX so that rank deduction can properly take place
|
||||
// unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that rank deduction can properly take place
|
||||
template< class ... P >
|
||||
explicit inline
|
||||
DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
|
||||
@ -1179,7 +1183,7 @@ public:
|
||||
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string >( arg_label )
|
||||
, typename traits::array_layout
|
||||
( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
|
||||
)
|
||||
)
|
||||
{}
|
||||
|
||||
// For backward compatibility
|
||||
@ -1189,8 +1193,7 @@ public:
|
||||
, const typename traits::array_layout & arg_layout
|
||||
)
|
||||
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
|
||||
|
||||
, Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout)
|
||||
, arg_layout
|
||||
)
|
||||
{}
|
||||
|
||||
@ -1205,7 +1208,9 @@ public:
|
||||
, const size_t arg_N6 =KOKKOS_INVALID_INDEX
|
||||
, const size_t arg_N7 =KOKKOS_INVALID_INDEX
|
||||
)
|
||||
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
|
||||
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
|
||||
, typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)
|
||||
)
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
@ -1445,30 +1450,30 @@ public:
|
||||
ret_type dst ;
|
||||
|
||||
const SubviewExtents< 7 , rank > extents =
|
||||
ExtentGenerator< Args ... >::generator( src.m_map.m_offset.m_dim , args... ) ;
|
||||
ExtentGenerator< Args ... >::generator( src.m_map.m_impl_offset.m_dim , args... ) ;
|
||||
|
||||
dst_offset_type tempdst( src.m_map.m_offset , extents ) ;
|
||||
dst_offset_type tempdst( src.m_map.m_impl_offset , extents ) ;
|
||||
|
||||
dst.m_track = src.m_track ;
|
||||
|
||||
dst.m_map.m_offset.m_dim.N0 = tempdst.m_dim.N0 ;
|
||||
dst.m_map.m_offset.m_dim.N1 = tempdst.m_dim.N1 ;
|
||||
dst.m_map.m_offset.m_dim.N2 = tempdst.m_dim.N2 ;
|
||||
dst.m_map.m_offset.m_dim.N3 = tempdst.m_dim.N3 ;
|
||||
dst.m_map.m_offset.m_dim.N4 = tempdst.m_dim.N4 ;
|
||||
dst.m_map.m_offset.m_dim.N5 = tempdst.m_dim.N5 ;
|
||||
dst.m_map.m_offset.m_dim.N6 = tempdst.m_dim.N6 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5 ;
|
||||
dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6 ;
|
||||
|
||||
dst.m_map.m_offset.m_stride.S0 = tempdst.m_stride.S0 ;
|
||||
dst.m_map.m_offset.m_stride.S1 = tempdst.m_stride.S1 ;
|
||||
dst.m_map.m_offset.m_stride.S2 = tempdst.m_stride.S2 ;
|
||||
dst.m_map.m_offset.m_stride.S3 = tempdst.m_stride.S3 ;
|
||||
dst.m_map.m_offset.m_stride.S4 = tempdst.m_stride.S4 ;
|
||||
dst.m_map.m_offset.m_stride.S5 = tempdst.m_stride.S5 ;
|
||||
dst.m_map.m_offset.m_stride.S6 = tempdst.m_stride.S6 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5 ;
|
||||
dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6 ;
|
||||
|
||||
dst.m_map.m_handle = dst_handle_type( src.m_map.m_handle +
|
||||
src.m_map.m_offset( extents.domain_offset(0)
|
||||
dst.m_map.m_impl_handle = dst_handle_type( src.m_map.m_impl_handle +
|
||||
src.m_map.m_impl_offset( extents.domain_offset(0)
|
||||
, extents.domain_offset(1)
|
||||
, extents.domain_offset(2)
|
||||
, extents.domain_offset(3)
|
||||
@ -1896,6 +1901,7 @@ inline
|
||||
typename DynRankView<T,P...>::HostMirror
|
||||
create_mirror( const DynRankView<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
|
||||
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
@ -1914,6 +1920,7 @@ inline
|
||||
typename DynRankView<T,P...>::HostMirror
|
||||
create_mirror( const DynRankView<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
|
||||
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
@ -1929,7 +1936,11 @@ create_mirror( const DynRankView<T,P...> & src
|
||||
|
||||
// Create a mirror in a new space (specialization for different space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src) {
|
||||
typename Impl::MirrorDRVType<Space,T,P ...>::view_type
|
||||
create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
|
||||
>::type * = 0) {
|
||||
return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
|
||||
}
|
||||
|
||||
@ -1985,6 +1996,29 @@ create_mirror_view(const Space& , const Kokkos::DynRankView<T,P...> & src
|
||||
return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
|
||||
}
|
||||
|
||||
// Create a mirror view and deep_copy in a new space (specialization for same space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
|
||||
, std::string const& name = ""
|
||||
, typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
(void)name;
|
||||
return src;
|
||||
}
|
||||
|
||||
// Create a mirror view and deep_copy in a new space (specialization for different space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
|
||||
, std::string const& name = ""
|
||||
, typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
using Mirror = typename Impl::MirrorDRViewType<Space,T,P ...>::view_type;
|
||||
std::string label = name.empty() ? src.label() : name;
|
||||
auto mirror = Mirror( Kokkos::ViewAllocateWithoutInitializing(label), Impl::reconstructLayout(src.layout(), src.rank()) );
|
||||
deep_copy(mirror, src);
|
||||
return mirror;
|
||||
}
|
||||
|
||||
} //end Kokkos
|
||||
|
||||
|
||||
|
||||
1895
lib/kokkos/containers/src/Kokkos_OffsetView.hpp
Normal file
@ -47,7 +47,9 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_Parallel_Reduce.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
|
||||
@ -86,14 +86,13 @@ public:
|
||||
vector():DV() {
|
||||
_size = 0;
|
||||
_extra_storage = 1.1;
|
||||
DV::modified_host() = 1;
|
||||
}
|
||||
|
||||
|
||||
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
|
||||
_size = n;
|
||||
_extra_storage = 1.1;
|
||||
DV::modified_host() = 1;
|
||||
DV::modified_flags(0) = 1;
|
||||
|
||||
assign(n,val);
|
||||
}
|
||||
@ -119,16 +118,16 @@ public:
|
||||
|
||||
/* Assign value either on host or on device */
|
||||
|
||||
if( DV::modified_host() >= DV::modified_device() ) {
|
||||
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
|
||||
set_functor_host f(DV::h_view,val);
|
||||
parallel_for(n,f);
|
||||
DV::t_host::execution_space::fence();
|
||||
DV::modified_host()++;
|
||||
DV::template modify<typename DV::t_host::device_type>();
|
||||
} else {
|
||||
set_functor f(DV::d_view,val);
|
||||
parallel_for(n,f);
|
||||
DV::t_dev::execution_space::fence();
|
||||
DV::modified_device()++;
|
||||
DV::template modify<typename DV::t_dev::device_type>();
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,7 +136,8 @@ public:
|
||||
}
|
||||
|
||||
void push_back(Scalar val) {
|
||||
DV::modified_host()++;
|
||||
DV::template sync<typename DV::t_host::device_type>();
|
||||
DV::template modify<typename DV::t_host::device_type>();
|
||||
if(_size == span()) {
|
||||
size_t new_size = _size*_extra_storage;
|
||||
if(new_size == _size) new_size++;
|
||||
@ -247,10 +247,10 @@ public:
|
||||
}
|
||||
|
||||
void on_host() {
|
||||
DV::modified_host() = DV::modified_device() + 1;
|
||||
DV::template modify<typename DV::t_host::device_type>();
|
||||
}
|
||||
void on_device() {
|
||||
DV::modified_device() = DV::modified_host() + 1;
|
||||
DV::template modify<typename DV::t_dev::device_type>();
|
||||
}
|
||||
|
||||
void set_overallocation(float extra) {
|
||||
|
||||
@ -23,6 +23,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
threads/TestThreads_DynRankViewAPI_rank12345.cpp
|
||||
threads/TestThreads_DynRankViewAPI_rank67.cpp
|
||||
threads/TestThreads_ErrorReporter.cpp
|
||||
threads/TestThreads_OffsetView.cpp
|
||||
threads/TestThreads_ScatterView.cpp
|
||||
threads/TestThreads_StaticCrsGraph.cpp
|
||||
threads/TestThreads_UnorderedMap.cpp
|
||||
@ -47,6 +48,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
serial/TestSerial_DynRankViewAPI_rank12345.cpp
|
||||
serial/TestSerial_DynRankViewAPI_rank67.cpp
|
||||
serial/TestSerial_ErrorReporter.cpp
|
||||
serial/TestSerial_OffsetView.cpp
|
||||
serial/TestSerial_ScatterView.cpp
|
||||
serial/TestSerial_StaticCrsGraph.cpp
|
||||
serial/TestSerial_UnorderedMap.cpp
|
||||
@ -71,6 +73,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
openmp/TestOpenMP_DynRankViewAPI_rank12345.cpp
|
||||
openmp/TestOpenMP_DynRankViewAPI_rank67.cpp
|
||||
openmp/TestOpenMP_ErrorReporter.cpp
|
||||
openmp/TestOpenMP_OffsetView.cpp
|
||||
openmp/TestOpenMP_ScatterView.cpp
|
||||
openmp/TestOpenMP_StaticCrsGraph.cpp
|
||||
openmp/TestOpenMP_UnorderedMap.cpp
|
||||
@ -95,6 +98,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
cuda/TestCuda_DynRankViewAPI_rank12345.cpp
|
||||
cuda/TestCuda_DynRankViewAPI_rank67.cpp
|
||||
cuda/TestCuda_ErrorReporter.cpp
|
||||
cuda/TestCuda_OffsetView.cpp
|
||||
cuda/TestCuda_ScatterView.cpp
|
||||
cuda/TestCuda_StaticCrsGraph.cpp
|
||||
cuda/TestCuda_UnorderedMap.cpp
|
||||
|
||||
@ -39,6 +39,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
OBJ_CUDA += TestCuda_DynRankViewAPI_rank12345.o
|
||||
OBJ_CUDA += TestCuda_DynRankViewAPI_rank67.o
|
||||
OBJ_CUDA += TestCuda_ErrorReporter.o
|
||||
OBJ_CUDA += TestCuda_OffsetView.o
|
||||
OBJ_CUDA += TestCuda_ScatterView.o
|
||||
OBJ_CUDA += TestCuda_StaticCrsGraph.o
|
||||
OBJ_CUDA += TestCuda_UnorderedMap.o
|
||||
@ -57,6 +58,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
|
||||
OBJ_ROCM += TestROCm_DynRankViewAPI_rank12345.o
|
||||
OBJ_ROCM += TestROCm_DynRankViewAPI_rank67.o
|
||||
OBJ_ROCM += TestROCm_ErrorReporter.o
|
||||
OBJ_ROCM += TestROCm_OffsetView.o
|
||||
OBJ_ROCM += TestROCm_ScatterView.o
|
||||
OBJ_ROCM += TestROCm_StaticCrsGraph.o
|
||||
OBJ_ROCM += TestROCm_UnorderedMap.o
|
||||
@ -75,6 +77,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
OBJ_THREADS += TestThreads_DynRankViewAPI_rank12345.o
|
||||
OBJ_THREADS += TestThreads_DynRankViewAPI_rank67.o
|
||||
OBJ_THREADS += TestThreads_ErrorReporter.o
|
||||
OBJ_THREADS += TestThreads_OffsetView.o
|
||||
OBJ_THREADS += TestThreads_ScatterView.o
|
||||
OBJ_THREADS += TestThreads_StaticCrsGraph.o
|
||||
OBJ_THREADS += TestThreads_UnorderedMap.o
|
||||
@ -93,6 +96,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank12345.o
|
||||
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank67.o
|
||||
OBJ_OPENMP += TestOpenMP_ErrorReporter.o
|
||||
OBJ_OPENMP += TestOpenMP_OffsetView.o
|
||||
OBJ_OPENMP += TestOpenMP_ScatterView.o
|
||||
OBJ_OPENMP += TestOpenMP_StaticCrsGraph.o
|
||||
OBJ_OPENMP += TestOpenMP_UnorderedMap.o
|
||||
@ -111,6 +115,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank12345.o
|
||||
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank67.o
|
||||
OBJ_SERIAL += TestSerial_ErrorReporter.o
|
||||
OBJ_SERIAL += TestSerial_OffsetView.o
|
||||
OBJ_SERIAL += TestSerial_ScatterView.o
|
||||
OBJ_SERIAL += TestSerial_StaticCrsGraph.o
|
||||
OBJ_SERIAL += TestSerial_UnorderedMap.o
|
||||
|
||||
@ -729,6 +729,7 @@ public:
|
||||
static void run_tests() {
|
||||
run_test_resize_realloc();
|
||||
run_test_mirror();
|
||||
run_test_mirror_and_copy();
|
||||
run_test_scalar();
|
||||
run_test();
|
||||
run_test_const();
|
||||
@ -885,6 +886,69 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test_mirror_and_copy()
|
||||
{
|
||||
// LayoutLeft
|
||||
{
|
||||
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_org( "A", 10 );
|
||||
a_org(5) = 42.0;
|
||||
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_h = a_org;
|
||||
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
|
||||
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
|
||||
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
|
||||
|
||||
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
|
||||
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
|
||||
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
|
||||
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
|
||||
|
||||
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
|
||||
ASSERT_EQ( equal_ptr_h_h2, 1 );
|
||||
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
|
||||
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
|
||||
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
|
||||
|
||||
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
|
||||
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
|
||||
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
|
||||
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
|
||||
ASSERT_EQ( a_h.rank(), a_org.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_h2.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_h3.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_d.rank() );
|
||||
ASSERT_EQ( a_org(5), a_h3(5) );
|
||||
}
|
||||
// LayoutRight
|
||||
{
|
||||
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_org( "A", 10 );
|
||||
a_org(5) = 42.0;
|
||||
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_h = a_org;
|
||||
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
|
||||
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
|
||||
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
|
||||
|
||||
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
|
||||
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
|
||||
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
|
||||
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
|
||||
|
||||
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
|
||||
ASSERT_EQ( equal_ptr_h_h2, 1 );
|
||||
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
|
||||
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
|
||||
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
|
||||
|
||||
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
|
||||
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
|
||||
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
|
||||
ASSERT_EQ( a_h.rank(), a_org.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_h2.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_h3.rank() );
|
||||
ASSERT_EQ( a_h.rank(), a_d.rank() );
|
||||
ASSERT_EQ( a_org(5), a_h3(5) );
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test_scalar()
|
||||
{
|
||||
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
|
||||
|
||||
426
lib/kokkos/containers/unit_tests/TestOffsetView.hpp
Normal file
@ -0,0 +1,426 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* FIXME the OffsetView class is really not very well tested.
|
||||
*/
|
||||
#ifndef CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
|
||||
#define CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
|
||||
|
||||
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
#include <Kokkos_OffsetView.hpp>
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
using std::endl;
|
||||
using std::cout;
|
||||
|
||||
namespace Test{
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_offsetview_construction(unsigned int size)
|
||||
{
|
||||
|
||||
typedef Kokkos::Experimental::OffsetView<Scalar**, Device> offset_view_type;
|
||||
typedef Kokkos::View<Scalar**, Device> view_type;
|
||||
|
||||
Kokkos::Experimental::index_list_type range0 = {-1, 3};
|
||||
Kokkos::Experimental::index_list_type range1 = {-2, 2};
|
||||
|
||||
offset_view_type ov("firstOV", range0, range1);
|
||||
|
||||
ASSERT_EQ("firstOV", ov.label());
|
||||
ASSERT_EQ(2, ov.Rank);
|
||||
|
||||
ASSERT_EQ(ov.begin(0), -1);
|
||||
ASSERT_EQ(ov.end(0), 4);
|
||||
|
||||
ASSERT_EQ(ov.begin(1), -2);
|
||||
ASSERT_EQ(ov.end(1), 3);
|
||||
|
||||
ASSERT_EQ(ov.extent(0), 5);
|
||||
ASSERT_EQ(ov.extent(1), 5);
|
||||
|
||||
const int ovmin0 = ov.begin(0);
|
||||
const int ovend0 = ov.end(0);
|
||||
const int ovmin1 = ov.begin(1);
|
||||
const int ovend1 = ov.end(1);
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::Experimental::OffsetView<Scalar*, Device> offsetV1("OneDOffsetView", range0);
|
||||
|
||||
Kokkos::RangePolicy<Device, int> rangePolicy1(offsetV1.begin(0), offsetV1.end(0));
|
||||
Kokkos::parallel_for(rangePolicy1, KOKKOS_LAMBDA (const int i){
|
||||
offsetV1(i) = 1;
|
||||
}
|
||||
);
|
||||
Kokkos::fence();
|
||||
|
||||
int OVResult = 0;
|
||||
Kokkos::parallel_reduce(rangePolicy1, KOKKOS_LAMBDA(const int i, int & updateMe){
|
||||
updateMe += offsetV1(i);
|
||||
}, OVResult);
|
||||
|
||||
Kokkos::fence();
|
||||
ASSERT_EQ(OVResult, offsetV1.end(0) - offsetV1.begin(0)) << "found wrong number of elements in OffsetView that was summed.";
|
||||
|
||||
}
|
||||
{ //test deep copy of scalar const value into mirror
|
||||
const int constVal = 6;
|
||||
typename offset_view_type::HostMirror hostOffsetView =
|
||||
Kokkos::Experimental::create_mirror_view(ov);
|
||||
|
||||
Kokkos::Experimental::deep_copy(hostOffsetView, constVal);
|
||||
|
||||
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
|
||||
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
|
||||
ASSERT_EQ(hostOffsetView(i,j), constVal) << "Bad data found in OffsetView";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type rangePolicy2D(point_type{ {ovmin0, ovmin1 } },
|
||||
point_type{ { ovend0, ovend1 } });
|
||||
|
||||
const int constValue = 9;
|
||||
Kokkos::parallel_for(rangePolicy2D, KOKKOS_LAMBDA (const int i, const int j) {
|
||||
ov(i,j) = constValue;
|
||||
}
|
||||
);
|
||||
|
||||
//test offsetview to offsetviewmirror deep copy
|
||||
typename offset_view_type::HostMirror hostOffsetView =
|
||||
Kokkos::Experimental::create_mirror_view(ov);
|
||||
|
||||
Kokkos::Experimental::deep_copy(hostOffsetView, ov);
|
||||
|
||||
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
|
||||
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
|
||||
ASSERT_EQ(hostOffsetView(i,j), constValue) << "Bad data found in OffsetView";
|
||||
}
|
||||
}
|
||||
|
||||
int OVResult = 0;
|
||||
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
|
||||
updateMe += ov(i, j);
|
||||
}, OVResult);
|
||||
|
||||
int answer = 0;
|
||||
for(int i = ov.begin(0); i < ov.end(0); ++i) {
|
||||
for(int j = ov.begin(1); j < ov.end(1); ++j) {
|
||||
answer += constValue;
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView";
|
||||
#endif
|
||||
|
||||
{
|
||||
offset_view_type ovCopy(ov);
|
||||
ASSERT_EQ(ovCopy==ov, true) <<
|
||||
"Copy constructor or equivalence operator broken";
|
||||
}
|
||||
|
||||
{
|
||||
offset_view_type ovAssigned = ov;
|
||||
ASSERT_EQ(ovAssigned==ov, true) <<
|
||||
"Assignment operator or equivalence operator broken";
|
||||
}
|
||||
|
||||
{ //construct OffsetView from a View plus begins array
|
||||
const int extent0 = 100;
|
||||
const int extent1 = 200;
|
||||
const int extent2 = 300;
|
||||
Kokkos::View<Scalar***, Device> view3D("view3D", extent0, extent1, extent2);
|
||||
|
||||
Kokkos::deep_copy(view3D, 1);
|
||||
|
||||
Kokkos::Array<int64_t,3> begins = {{-10, -20, -30}};
|
||||
Kokkos::Experimental::OffsetView<Scalar***, Device> offsetView3D(view3D, begins);
|
||||
|
||||
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>, Kokkos::IndexType<int64_t> > range3_type;
|
||||
typedef typename range3_type::point_type point3_type;
|
||||
|
||||
range3_type rangePolicy3DZero(point3_type{ {0, 0, 0 } },
|
||||
point3_type{ { extent0, extent1, extent2 } });
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
int view3DSum = 0;
|
||||
Kokkos::parallel_reduce(rangePolicy3DZero, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
|
||||
updateMe += view3D(i, j, k);
|
||||
}, view3DSum);
|
||||
|
||||
range3_type rangePolicy3D(point3_type{ {begins[0], begins[1], begins[2] } },
|
||||
point3_type{ { begins[0] + extent0, begins[1] + extent1, begins[2] + extent2 } });
|
||||
int offsetView3DSum = 0;
|
||||
|
||||
Kokkos::parallel_reduce(rangePolicy3D, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
|
||||
updateMe += offsetView3D(i, j, k);
|
||||
}, offsetView3DSum);
|
||||
|
||||
ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken.";
|
||||
#endif
|
||||
}
|
||||
view_type viewFromOV = ov.view();
|
||||
|
||||
ASSERT_EQ(viewFromOV == ov, true) <<
|
||||
"OffsetView::view() or equivalence operator View == OffsetView broken";
|
||||
|
||||
{
|
||||
offset_view_type ovFromV(viewFromOV, {-1, -2});
|
||||
|
||||
ASSERT_EQ(ovFromV == viewFromOV , true) <<
|
||||
"Construction of OffsetView from View or equivalence operator OffsetView == View broken";
|
||||
}
|
||||
{
|
||||
offset_view_type ovFromV = viewFromOV;
|
||||
ASSERT_EQ(ovFromV == viewFromOV , true) <<
|
||||
"Construction of OffsetView from View by assignment (implicit conversion) or equivalence operator OffsetView == View broken";
|
||||
}
|
||||
|
||||
{// test offsetview to view deep copy
|
||||
view_type aView("aView", ov.extent(0), ov.extent(1));
|
||||
Kokkos::Experimental::deep_copy(aView, ov);
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
int sum = 0;
|
||||
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
|
||||
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
|
||||
}, sum);
|
||||
|
||||
ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken.";
|
||||
#endif
|
||||
}
|
||||
|
||||
{// test view to offsetview deep copy
|
||||
view_type aView("aView", ov.extent(0), ov.extent(1));
|
||||
|
||||
Kokkos::deep_copy(aView, 99);
|
||||
Kokkos::Experimental::deep_copy(ov, aView);
|
||||
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
int sum = 0;
|
||||
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
|
||||
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
|
||||
}, sum);
|
||||
|
||||
ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken.";
|
||||
#endif
|
||||
}
|
||||
}
|
||||
template <typename Scalar, typename Device>
|
||||
void test_offsetview_subview(unsigned int size)
|
||||
{
|
||||
{//test subview 1
|
||||
Kokkos::Experimental::OffsetView<Scalar*, Device> sliceMe("offsetToSlice", {-10, 20});
|
||||
{
|
||||
auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0);
|
||||
ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
}
|
||||
{//test subview 2
|
||||
Kokkos::Experimental::OffsetView<Scalar**, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30});
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),-2);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{//test subview rank 3
|
||||
|
||||
Kokkos::Experimental::OffsetView<Scalar***, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40});
|
||||
|
||||
//slice 1
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(),Kokkos::ALL(), 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(), 0,Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(),Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
|
||||
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(), Kokkos::make_pair(-30, -21));
|
||||
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
|
||||
|
||||
ASSERT_EQ(offsetSubview.begin(0) , -20);
|
||||
ASSERT_EQ(offsetSubview.end(0) , 31);
|
||||
ASSERT_EQ(offsetSubview.begin(1) , 0);
|
||||
ASSERT_EQ(offsetSubview.end(1) , 9);
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
const int b0 = offsetSubview.begin(0);
|
||||
const int b1 = offsetSubview.begin(1);
|
||||
|
||||
const int e0 = offsetSubview.end(0);
|
||||
const int e1 = offsetSubview.end(1);
|
||||
|
||||
range_type rangeP2D(point_type{ {b0, b1 } }, point_type{ { e0, e1} });
|
||||
|
||||
Kokkos::parallel_for(rangeP2D, KOKKOS_LAMBDA(const int i, const int j) {
|
||||
offsetSubview(i,j) = 6;
|
||||
}
|
||||
);
|
||||
|
||||
int sum = 0;
|
||||
Kokkos::parallel_reduce(rangeP2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
|
||||
updateMe += offsetSubview(i, j);
|
||||
}, sum);
|
||||
|
||||
ASSERT_EQ(sum, 6*(e0-b0)*(e1-b1));
|
||||
#endif
|
||||
}
|
||||
|
||||
// slice 2
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
}
|
||||
|
||||
{//test subview rank 4
|
||||
|
||||
Kokkos::Experimental::OffsetView<Scalar****, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40}, {-40, 50});
|
||||
|
||||
//slice 1
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),Kokkos::ALL(), Kokkos::ALL(), 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe ,Kokkos::ALL(), 0, Kokkos::ALL(),Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe , 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL() );
|
||||
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
// slice 2
|
||||
auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0);
|
||||
ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken.";
|
||||
{
|
||||
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0);
|
||||
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
|
||||
}
|
||||
// slice 3
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0);
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
{
|
||||
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL());
|
||||
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY, offsetview_construction) {
|
||||
test_offsetview_construction<int,TEST_EXECSPACE>(10);
|
||||
}
|
||||
TEST_F( TEST_CATEGORY, offsetview_subview) {
|
||||
test_offsetview_subview<int,TEST_EXECSPACE>(10);
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
#endif /* CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ */
|
||||
@ -80,7 +80,9 @@ void test_scatter_view_config(int n)
|
||||
Kokkos::Experimental::contribute(original_view, scatter_view);
|
||||
}
|
||||
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
|
||||
Kokkos::fence();
|
||||
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
|
||||
Kokkos::fence();
|
||||
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
|
||||
auto val0 = host_view(i, 0);
|
||||
auto val1 = host_view(i, 1);
|
||||
@ -111,9 +113,6 @@ struct TestDuplicatedScatterView {
|
||||
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
|
||||
Kokkos::Experimental::ScatterDuplicated,
|
||||
Kokkos::Experimental::ScatterNonAtomic>(n);
|
||||
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
|
||||
Kokkos::Experimental::ScatterDuplicated,
|
||||
Kokkos::Experimental::ScatterAtomic>(n);
|
||||
}
|
||||
};
|
||||
|
||||
@ -127,6 +126,16 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_ROCM
|
||||
// disable duplicated instantiation with ROCm until
|
||||
// UniqueToken can support it
|
||||
template <>
|
||||
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
|
||||
TestDuplicatedScatterView(int) {
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template <typename ExecSpace>
|
||||
void test_scatter_view(int n)
|
||||
{
|
||||
@ -142,16 +151,28 @@ void test_scatter_view(int n)
|
||||
Kokkos::Experimental::ScatterNonDuplicated,
|
||||
Kokkos::Experimental::ScatterNonAtomic>(n);
|
||||
}
|
||||
#ifdef KOKKOS_ENABLE_SERIAL
|
||||
if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
|
||||
#endif
|
||||
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
|
||||
Kokkos::Experimental::ScatterNonDuplicated,
|
||||
Kokkos::Experimental::ScatterAtomic>(n);
|
||||
#ifdef KOKKOS_ENABLE_SERIAL
|
||||
}
|
||||
#endif
|
||||
|
||||
TestDuplicatedScatterView<ExecSpace> duptest(n);
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY, scatterview) {
|
||||
#ifndef KOKKOS_ENABLE_ROCM
|
||||
test_scatter_view<TEST_EXECSPACE>(10);
|
||||
#ifdef KOKKOS_ENABLE_DEBUG
|
||||
test_scatter_view<TEST_EXECSPACE>(100000);
|
||||
#else
|
||||
test_scatter_view<TEST_EXECSPACE>(10000000);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include <Kokkos_StaticCrsGraph.hpp>
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
namespace Test {
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<cuda/TestCuda_Category.hpp>
|
||||
#include<TestOffsetView.hpp>
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<openmp/TestOpenMP_Category.hpp>
|
||||
#include<TestOffsetView.hpp>
|
||||
|
||||
@ -60,6 +60,6 @@ protected:
|
||||
} // namespace Test
|
||||
|
||||
#define TEST_CATEGORY rocm
|
||||
#define TEST_EXECSPACE Kokkos::ROCm
|
||||
#define TEST_EXECSPACE Kokkos::Experimental::ROCm
|
||||
|
||||
#endif
|
||||
|
||||
@ -0,0 +1,46 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<serial/TestSerial_Category.hpp>
|
||||
#include<TestOffsetView.hpp>
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include<threads/TestThreads_Category.hpp>
|
||||
#include<TestOffsetView.hpp>
|
||||
|
||||
@ -108,3 +108,7 @@ else()
|
||||
|
||||
endif()
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
# build and install pkgconfig file
|
||||
CONFIGURE_FILE(kokkos.pc.in kokkos.pc @ONLY)
|
||||
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)
|
||||
|
||||
@ -208,7 +208,7 @@ struct CudaParallelLaunch< DriverType
|
||||
, const int shmem
|
||||
, const cudaStream_t stream = 0 )
|
||||
{
|
||||
if ( grid.x && ( block.x * block.y * block.z ) ) {
|
||||
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
|
||||
|
||||
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
|
||||
sizeof( DriverType ) ) {
|
||||
@ -264,7 +264,7 @@ struct CudaParallelLaunch< DriverType
|
||||
, const int shmem
|
||||
, const cudaStream_t stream = 0 )
|
||||
{
|
||||
if ( grid.x && ( block.x * block.y * block.z ) ) {
|
||||
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
|
||||
|
||||
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
|
||||
sizeof( DriverType ) ) {
|
||||
@ -321,7 +321,7 @@ struct CudaParallelLaunch< DriverType
|
||||
, const int shmem
|
||||
, const cudaStream_t stream = 0 )
|
||||
{
|
||||
if ( grid.x && ( block.x * block.y * block.z ) ) {
|
||||
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
|
||||
|
||||
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
|
||||
sizeof( DriverType ) ) {
|
||||
@ -370,7 +370,7 @@ struct CudaParallelLaunch< DriverType
|
||||
, const int shmem
|
||||
, const cudaStream_t stream = 0 )
|
||||
{
|
||||
if ( grid.x && ( block.x * block.y * block.z ) ) {
|
||||
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
|
||||
|
||||
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
|
||||
sizeof( DriverType ) ) {
|
||||
|
||||
@ -453,6 +453,8 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
|
||||
, arg_label.c_str()
|
||||
, SharedAllocationHeader::maximum_label_length
|
||||
);
|
||||
// Set last element zero, in case c_str is too long
|
||||
header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
|
||||
|
||||
// Copy to device memory
|
||||
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
|
||||
@ -491,6 +493,9 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
|
||||
, arg_label.c_str()
|
||||
, SharedAllocationHeader::maximum_label_length
|
||||
);
|
||||
|
||||
// Set last element zero, in case c_str is too long
|
||||
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
|
||||
}
|
||||
|
||||
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
|
||||
@ -525,6 +530,8 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
|
||||
, arg_label.c_str()
|
||||
, SharedAllocationHeader::maximum_label_length
|
||||
);
|
||||
// Set last element zero, in case c_str is too long
|
||||
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -689,9 +689,13 @@ Cuda::size_type cuda_internal_multiprocessor_count()
|
||||
|
||||
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
|
||||
{
|
||||
#if defined(KOKKOS_ARCH_KEPLER)
|
||||
// Compute capability 3.0 through 3.7
|
||||
enum : int { max_resident_blocks_per_multiprocessor = 16 };
|
||||
#else
|
||||
// Compute capability 5.0 through 6.2
|
||||
enum : int { max_resident_blocks_per_multiprocessor = 32 };
|
||||
|
||||
#endif
|
||||
return CudaInternal::singleton().m_multiProcCount
|
||||
* max_resident_blocks_per_multiprocessor ;
|
||||
};
|
||||
|
||||
@ -52,22 +52,22 @@
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
template<class DriverType, bool Large>
|
||||
template<class DriverType, class LaunchBounds, bool Large>
|
||||
struct CudaGetMaxBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
template<class DriverType, class LaunchBounds>
|
||||
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
|
||||
return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
|
||||
}
|
||||
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType,true> {
|
||||
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
int blockSize=32;
|
||||
int blockSize=1024;
|
||||
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
@ -76,8 +76,9 @@ struct CudaGetMaxBlockSize<DriverType,true> {
|
||||
blockSize,
|
||||
sharedmem);
|
||||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
if(numBlocks>0) return blockSize;
|
||||
while (blockSize>32 && numBlocks==0) {
|
||||
blockSize/=2;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
@ -87,19 +88,30 @@ struct CudaGetMaxBlockSize<DriverType,true> {
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
if(numBlocks>0) return blockSize;
|
||||
else return blockSize/2;
|
||||
int blockSizeUpperBound = blockSize*2;
|
||||
while (blockSize<blockSizeUpperBound && numBlocks>0) {
|
||||
blockSize+=32;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
return blockSize - 32;
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType,false> {
|
||||
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
|
||||
int blockSize=32;
|
||||
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
unsigned int blockSize=1024;
|
||||
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
@ -107,8 +119,9 @@ struct CudaGetMaxBlockSize<DriverType,false> {
|
||||
blockSize,
|
||||
sharedmem);
|
||||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
if(numBlocks>0) return blockSize;
|
||||
while (blockSize>32 && numBlocks==0) {
|
||||
blockSize/=2;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
@ -118,24 +131,121 @@ struct CudaGetMaxBlockSize<DriverType,false> {
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
if(numBlocks>0) return blockSize;
|
||||
else return blockSize/2;
|
||||
unsigned int blockSizeUpperBound = blockSize*2;
|
||||
while (blockSize<blockSizeUpperBound && numBlocks>0) {
|
||||
blockSize+=32;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
return blockSize - 32;
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
|
||||
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks = 0, oldNumBlocks = 0;
|
||||
unsigned int blockSize=MaxThreadsPerBlock;
|
||||
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
|
||||
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
|
||||
|
||||
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
|
||||
blockSize/=2;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
|
||||
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>MinBlocksPerSM) {
|
||||
blockSize+=32;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
oldNumBlocks = numBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
|
||||
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks = 0, oldNumBlocks = 0;
|
||||
unsigned int blockSize=MaxThreadsPerBlock;
|
||||
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
|
||||
|
||||
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
|
||||
blockSize/=2;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
|
||||
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
|
||||
blockSize+=32;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
oldNumBlocks = numBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template<class DriverType, bool Large>
|
||||
template<class DriverType, class LaunchBounds, bool Large>
|
||||
struct CudaGetOptBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
template<class DriverType, class LaunchBounds>
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
|
||||
return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
|
||||
}
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType,true> {
|
||||
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
@ -165,7 +275,7 @@ struct CudaGetOptBlockSize<DriverType,true> {
|
||||
};
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType,false> {
|
||||
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
@ -194,6 +304,75 @@ struct CudaGetOptBlockSize<DriverType,false> {
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
|
||||
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy=0;
|
||||
int bestBlockSize=0;
|
||||
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
|
||||
|
||||
while(blockSize < max_threads_per_block ) {
|
||||
blockSize*=2;
|
||||
|
||||
//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
|
||||
if(maxOccupancy < numBlocks*blockSize) {
|
||||
maxOccupancy = numBlocks*blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(maxOccupancy > 0)
|
||||
return bestBlockSize;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
|
||||
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy=0;
|
||||
int bestBlockSize=0;
|
||||
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
|
||||
|
||||
while(blockSize < max_threads_per_block ) {
|
||||
blockSize*=2;
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
|
||||
if(maxOccupancy < numBlocks*blockSize) {
|
||||
maxOccupancy = numBlocks*blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(maxOccupancy > 0)
|
||||
return bestBlockSize;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
@ -148,6 +148,9 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
static int lock_array_copied = 0;
|
||||
inline int eliminate_warning_for_lock_array() {
|
||||
return lock_array_copied;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,6 +60,7 @@
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
@ -114,6 +115,7 @@ public:
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & functor )
|
||||
@ -131,7 +133,35 @@ public:
|
||||
|
||||
return n ;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_max( const FunctorType& f, const ParallelForTag& ) const {
|
||||
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
|
||||
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
|
||||
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) );
|
||||
return block_size/vector_length();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_max( const FunctorType& f, const ParallelReduceTag& ) const {
|
||||
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
|
||||
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
|
||||
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
|
||||
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
|
||||
|
||||
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
|
||||
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
|
||||
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
|
||||
|
||||
// Currently we require Power-of-2 team size for reductions.
|
||||
int p2 = 1;
|
||||
while(p2<=block_size) p2*=2;
|
||||
p2/=2;
|
||||
return p2/vector_length();
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & functor )
|
||||
{ return team_size_max( functor ); }
|
||||
@ -143,11 +173,41 @@ public:
|
||||
if(max<1) max = 1;
|
||||
return max;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_recommended( const FunctorType& f, const ParallelForTag& ) const {
|
||||
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
|
||||
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
|
||||
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double));
|
||||
return block_size/vector_length();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_recommended( const FunctorType& f, const ParallelReduceTag& ) const {
|
||||
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
|
||||
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
|
||||
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
|
||||
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
|
||||
|
||||
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
|
||||
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
|
||||
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
|
||||
return block_size/vector_length();
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
int vector_length_max()
|
||||
{ return Impl::CudaTraits::WarpSize; }
|
||||
|
||||
inline static
|
||||
int scratch_size_max(int level)
|
||||
{ return (level==0?
|
||||
1024*40: // 48kB is the max for CUDA, but we need some for team_member.reduce etc.
|
||||
20*1024*1024); // arbitrarily setting this to 20MB, for a Volta V100 that would give us about 3.2GB for 2 teams per SM
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline int vector_length() const { return m_vector_length ; }
|
||||
@ -419,7 +479,7 @@ public:
|
||||
void execute() const
|
||||
{
|
||||
const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
|
||||
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( m_functor , 1, 0 , 0 );
|
||||
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds>( m_functor , 1, 0 , 0 );
|
||||
const dim3 block( 1 , block_size , 1);
|
||||
const dim3 grid( std::min( typename Policy::index_type(( nwork + block.y - 1 ) / block.y) , typename Policy::index_type(cuda_internal_maximum_grid_count()) ) , 1 , 1);
|
||||
|
||||
@ -654,7 +714,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
|
||||
@ -670,7 +730,7 @@ public:
|
||||
}
|
||||
|
||||
if ( int(m_team_size) >
|
||||
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
|
||||
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor, LaunchBounds >
|
||||
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
|
||||
}
|
||||
@ -725,12 +785,13 @@ public:
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const bool m_result_ptr_device_accessible ;
|
||||
size_type * m_scratch_space ;
|
||||
size_type * m_scratch_flags ;
|
||||
size_type * m_unified_space ;
|
||||
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
|
||||
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
|
||||
enum { UseShflReduction = false };//((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
@ -752,12 +813,12 @@ public:
|
||||
|
||||
__device__ inline
|
||||
void operator() () const {
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummySHMEMReductionType& ) const
|
||||
{
|
||||
{*/
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
|
||||
|
||||
@ -786,7 +847,8 @@ public:
|
||||
// This is the final block with the final result at the final threads' location
|
||||
|
||||
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
|
||||
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
|
||||
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
|
||||
( m_unified_space ? m_unified_space : m_scratch_space );
|
||||
|
||||
if ( threadIdx.y == 0 ) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
|
||||
@ -798,10 +860,9 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
/* __device__ inline
|
||||
void run(const DummyShflReductionType&) const
|
||||
{
|
||||
|
||||
value_type value;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
// Number of blocks is bounded so that the reduction can be limited to two passes.
|
||||
@ -832,7 +893,7 @@ public:
|
||||
*result = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
// Determine block size constrained by shared memory:
|
||||
static inline
|
||||
@ -863,16 +924,18 @@ public:
|
||||
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
if(!m_result_ptr_device_accessible) {
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -883,17 +946,18 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, const ViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
Kokkos::is_view< ViewType >::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -906,6 +970,7 @@ public:
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -953,6 +1018,7 @@ public:
|
||||
const Policy m_policy ; // used for workrange and nwork
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const bool m_result_ptr_device_accessible ;
|
||||
size_type * m_scratch_space ;
|
||||
size_type * m_scratch_flags ;
|
||||
size_type * m_unified_space ;
|
||||
@ -960,7 +1026,7 @@ public:
|
||||
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
|
||||
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
|
||||
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
|
||||
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && (ValueTraits::StaticValueSize!=0)) };
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
@ -978,12 +1044,12 @@ public:
|
||||
inline
|
||||
__device__
|
||||
void operator() (void) const {
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummySHMEMReductionType& ) const
|
||||
{
|
||||
{*/
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
|
||||
|
||||
@ -1007,7 +1073,8 @@ public:
|
||||
|
||||
// This is the final block with the final result at the final threads' location
|
||||
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
|
||||
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
|
||||
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
|
||||
( m_unified_space ? m_unified_space : m_scratch_space );
|
||||
|
||||
if ( threadIdx.y == 0 ) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
|
||||
@ -1019,7 +1086,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
/* __device__ inline
|
||||
void run(const DummyShflReductionType&) const
|
||||
{
|
||||
|
||||
@ -1051,7 +1118,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
// Determine block size constrained by shared memory:
|
||||
static inline
|
||||
unsigned local_block_size( const FunctorType & f )
|
||||
@ -1089,16 +1156,18 @@ public:
|
||||
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
if(!m_result_ptr_device_accessible) {
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1109,17 +1178,18 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, const ViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
Kokkos::is_view< ViewType >::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -1132,6 +1202,7 @@ public:
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -1174,7 +1245,7 @@ public:
|
||||
typedef FunctorType functor_type ;
|
||||
typedef Cuda::size_type size_type ;
|
||||
|
||||
enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
|
||||
enum { UseShflReduction = (true && (ValueTraits::StaticValueSize!=0)) };
|
||||
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
@ -1191,6 +1262,7 @@ private:
|
||||
const FunctorType m_functor ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const bool m_result_ptr_device_accessible ;
|
||||
size_type * m_scratch_space ;
|
||||
size_type * m_scratch_flags ;
|
||||
size_type * m_unified_space ;
|
||||
@ -1279,7 +1351,8 @@ public:
|
||||
// This is the final block with the final result at the final threads' location
|
||||
|
||||
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
|
||||
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
|
||||
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
|
||||
( m_unified_space ? m_unified_space : m_scratch_space );
|
||||
|
||||
if ( threadIdx.y == 0 ) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
|
||||
@ -1312,12 +1385,18 @@ public:
|
||||
, value );
|
||||
}
|
||||
|
||||
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
|
||||
pointer_type const result = m_result_ptr_device_accessible? m_result_ptr :
|
||||
(pointer_type) ( m_unified_space ? m_unified_space : m_scratch_space );
|
||||
|
||||
value_type init;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
|
||||
if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
|
||||
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
|
||||
if(
|
||||
Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
|
||||
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)
|
||||
//This breaks a test
|
||||
// Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
|
||||
// kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags)
|
||||
) {
|
||||
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
if(id==0) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
|
||||
@ -1331,7 +1410,7 @@ public:
|
||||
{
|
||||
const int nwork = m_league_size * m_team_size ;
|
||||
if ( nwork ) {
|
||||
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
|
||||
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024*32) )
|
||||
:std::min( m_league_size , m_team_size );
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
|
||||
@ -1344,16 +1423,18 @@ public:
|
||||
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
if(!m_result_ptr_device_accessible) {
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1364,16 +1445,17 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, const ViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
Kokkos::is_view< ViewType >::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -1383,17 +1465,17 @@ public:
|
||||
, m_scratch_ptr{NULL,NULL}
|
||||
, m_scratch_size{
|
||||
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
)}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
@ -1430,9 +1512,7 @@ public:
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
|
||||
}
|
||||
|
||||
if ( unsigned(m_team_size) >
|
||||
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
|
||||
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
|
||||
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
|
||||
}
|
||||
|
||||
@ -1444,6 +1524,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
@ -1453,7 +1534,7 @@ public:
|
||||
, m_scratch_ptr{NULL,NULL}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
@ -1486,10 +1567,7 @@ public:
|
||||
CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
|
||||
}
|
||||
|
||||
if ( int(m_team_size) >
|
||||
int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
|
||||
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
|
||||
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
|
||||
}
|
||||
|
||||
@ -1753,7 +1831,7 @@ public:
|
||||
// Occupancy calculator assumes whole block.
|
||||
|
||||
m_team_size =
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >
|
||||
( arg_functor
|
||||
, arg_policy.vector_length()
|
||||
, arg_policy.team_scratch_size(0)
|
||||
@ -1970,7 +2048,9 @@ private:
|
||||
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
|
||||
|
||||
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
|
||||
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
#endif
|
||||
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
|
||||
|
||||
__syncthreads(); // Don't overwrite previous iteration values until they are used
|
||||
@ -1981,7 +2061,11 @@ private:
|
||||
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
|
||||
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
|
||||
|
||||
// Call functor to accumulate inclusive scan value for this work item
|
||||
@ -2189,6 +2273,9 @@ private:
|
||||
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
|
||||
|
||||
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
#endif
|
||||
|
||||
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
|
||||
|
||||
@ -2201,6 +2288,11 @@ private:
|
||||
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
|
||||
|
||||
// Call functor to accumulate inclusive scan value for this work item
|
||||
|
||||
@ -194,8 +194,9 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
|
||||
*/
|
||||
|
||||
template< class ValueType , class JoinOp>
|
||||
__device__
|
||||
inline void cuda_intra_warp_reduction( ValueType& result,
|
||||
__device__ inline
|
||||
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
|
||||
cuda_intra_warp_reduction( ValueType& result,
|
||||
const JoinOp& join,
|
||||
const uint32_t max_active_thread = blockDim.y) {
|
||||
|
||||
@ -214,8 +215,9 @@ inline void cuda_intra_warp_reduction( ValueType& result,
|
||||
}
|
||||
|
||||
template< class ValueType , class JoinOp>
|
||||
__device__
|
||||
inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
__device__ inline
|
||||
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
|
||||
cuda_inter_warp_reduction( ValueType& value,
|
||||
const JoinOp& join,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
@ -247,8 +249,9 @@ inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
}
|
||||
|
||||
template< class ValueType , class JoinOp>
|
||||
__device__
|
||||
inline void cuda_intra_block_reduction( ValueType& value,
|
||||
__device__ inline
|
||||
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
|
||||
cuda_intra_block_reduction( ValueType& value,
|
||||
const JoinOp& join,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
cuda_intra_warp_reduction(value,join,max_active_thread);
|
||||
@ -314,31 +317,52 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
@ -478,31 +502,52 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -513,6 +558,213 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FunctorType, class ArgTag, bool DoScan, bool UseShfl>
|
||||
struct CudaReductionsFunctor;
|
||||
|
||||
template<class FunctorType, class ArgTag>
|
||||
struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
|
||||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::value_type Scalar;
|
||||
|
||||
__device__
|
||||
static inline void scalar_intra_warp_reduction(
|
||||
const FunctorType& functor,
|
||||
Scalar value, // Contribution
|
||||
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
|
||||
const int width, // How much of the warp participates
|
||||
Scalar& result)
|
||||
{
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
|
||||
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
|
||||
Scalar tmp;
|
||||
cuda_shfl_down(tmp,value,delta,width,mask);
|
||||
ValueJoin::join( functor , &value, &tmp);
|
||||
}
|
||||
|
||||
cuda_shfl(result,value,0,width,mask);
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void scalar_intra_block_reduction(
|
||||
const FunctorType& functor,
|
||||
Scalar value,
|
||||
const bool skip,
|
||||
Scalar* my_global_team_buffer_element,
|
||||
const int shared_elements,
|
||||
Scalar* shared_team_buffer_element) {
|
||||
|
||||
const int warp_id = (threadIdx.y*blockDim.x)/32;
|
||||
Scalar* const my_shared_team_buffer_element =
|
||||
shared_team_buffer_element + warp_id%shared_elements;
|
||||
|
||||
// Warp Level Reduction, ignoring Kokkos vector entries
|
||||
scalar_intra_warp_reduction(functor,value,skip,32,value);
|
||||
|
||||
if(warp_id<shared_elements) {
|
||||
*my_shared_team_buffer_element=value;
|
||||
}
|
||||
// Wait for every warp to be done before using one warp to do final cross warp reduction
|
||||
__syncthreads();
|
||||
|
||||
const int num_warps = blockDim.x*blockDim.y/32;
|
||||
for(int w = shared_elements; w<num_warps; w+=shared_elements) {
|
||||
if(warp_id>=w && warp_id<w+shared_elements) {
|
||||
if((threadIdx.y*blockDim.x + threadIdx.x)%32==0)
|
||||
ValueJoin::join( functor , my_shared_team_buffer_element, &value);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
if( warp_id == 0) {
|
||||
ValueInit::init( functor , &value );
|
||||
for(unsigned int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
|
||||
ValueJoin::join( functor , &value,&shared_team_buffer_element[i]);
|
||||
scalar_intra_warp_reduction(functor,value,false,32,*my_global_team_buffer_element);
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline bool scalar_inter_block_reduction(
|
||||
const FunctorType & functor ,
|
||||
const Cuda::size_type block_id ,
|
||||
const Cuda::size_type block_count ,
|
||||
Cuda::size_type * const shared_data ,
|
||||
Cuda::size_type * const global_data ,
|
||||
Cuda::size_type * const global_flags ) {
|
||||
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
|
||||
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
|
||||
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
|
||||
Scalar value = shared_team_buffer_elements[threadIdx.y];
|
||||
int shared_elements=blockDim.x*blockDim.y/32;
|
||||
int global_elements=block_count;
|
||||
__syncthreads();
|
||||
|
||||
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
|
||||
__syncthreads();
|
||||
unsigned int num_teams_done = 0;
|
||||
if(threadIdx.x + threadIdx.y == 0) {
|
||||
__threadfence();
|
||||
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
|
||||
}
|
||||
bool is_last_block = false;
|
||||
if(__syncthreads_or(num_teams_done == gridDim.x)) {
|
||||
is_last_block=true;
|
||||
*global_flags = 0;
|
||||
ValueInit::init( functor, &value);
|
||||
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
|
||||
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
|
||||
}
|
||||
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
|
||||
}
|
||||
return is_last_block;
|
||||
}
|
||||
};
|
||||
|
||||
// Scalar reduction helpers for CUDA: partial specialization of
// CudaReductionsFunctor for the <false, false> setting of its two boolean
// template parameters.
// NOTE(review): the meaning of those two flags is defined by the primary
// template, which is not visible here - confirm before relying on it.
// All members are static __device__ functions that combine values with the
// functor's ValueJoin / ValueInit traits.
template<class FunctorType, class ArgTag>
|
||||
struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
|
||||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::value_type Scalar;
|
||||
|
||||
// Reduce within one warp over a buffer of per-thread slots.
// `value` points at the calling thread's own slot; each step joins the slot
// `delta` entries further on into it, with `delta` doubling until it reaches
// `width`. On return every thread's slot holds a copy of the slot that lies
// `lane_id` entries before it.
__device__
|
||||
static inline void scalar_intra_warp_reduction(
|
||||
const FunctorType& functor,
|
||||
Scalar* value, // Contribution
|
||||
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
|
||||
const int width) // How much of the warp participates
|
||||
{
|
||||
// Participation mask, only built when the sync macro takes one: all 32 bits
// when width==32, otherwise `width` consecutive bits shifted to the group
// selected by (linear thread index % (32/width)).
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
|
||||
#endif
|
||||
// Position of this thread inside its warp, from the linear thread index.
const int lane_id = (threadIdx.y*blockDim.x+threadIdx.x)%32;
|
||||
// Start at blockDim.x when skip_vector is set, otherwise at 1.
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
|
||||
// Only join when the partner slot still belongs to a lane of this warp.
if(lane_id + delta<32) {
|
||||
ValueJoin::join( functor , value, value+delta);
|
||||
}
|
||||
// Synchronize between steps so the next step reads the joined values.
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
}
|
||||
// Copy back the slot lane_id entries earlier (lane 0's slot when slots are
// indexed by linear thread id, as the caller in this struct does).
*value=*(value-lane_id);
|
||||
}
|
||||
|
||||
|
||||
// Reduce `value` across the whole thread block through the buffer
// `shared_team_buffer_element` (one Scalar slot per thread, indexed by linear
// thread id). The thread with threadIdx.x + threadIdx.y == 0 in the first
// warp writes the block result to `*result`.
// `skip` is forwarded as skip_vector to the first warp-level pass.
// `shared_elements` is not referenced in the body.
__device__
|
||||
static inline void scalar_intra_block_reduction(
|
||||
const FunctorType& functor,
|
||||
Scalar value,
|
||||
const bool skip,
|
||||
Scalar* result,
|
||||
const int shared_elements,
|
||||
Scalar* shared_team_buffer_element) {
|
||||
|
||||
// Note: computed from threadIdx.y*blockDim.x only; threadIdx.x is not included.
const int warp_id = (threadIdx.y*blockDim.x)/32;
|
||||
Scalar* const my_shared_team_buffer_element =
|
||||
shared_team_buffer_element + threadIdx.y*blockDim.x+threadIdx.x;
|
||||
*my_shared_team_buffer_element = value;
|
||||
// Warp Level Reduction, ignoring Kokkos vector entries
|
||||
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,skip,32);
|
||||
// Wait for every warp to be done before using one warp to do final cross warp reduction
|
||||
__syncthreads();
|
||||
|
||||
if( warp_id == 0) {
|
||||
// Gather the slot at index (linear thread id * 32), i.e. the first slot of
// each warp, into this thread's own slot when it lies inside the block.
const unsigned int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
|
||||
if(delta<blockDim.x*blockDim.y)
|
||||
*my_shared_team_buffer_element = shared_team_buffer_element[delta];
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
// Second pass over the gathered values; width is the block size divided by 32.
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,false,blockDim.x*blockDim.y/32);
|
||||
if(threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce across blocks. Each block reduces its own values and stores the
// result in global_data (viewed as Scalar[]) at index blockIdx.x, then bumps
// the counter at `global_flags`. The block that observes the counter reach
// gridDim.x resets the counter, joins all `block_count` per-block results and
// leaves the total in shared_data (viewed as Scalar[]) at index blockDim.y-1.
// Returns true only in that last block.
// `block_id` is not referenced in the body; blockIdx.x is used instead.
__device__
|
||||
static inline bool scalar_inter_block_reduction(
|
||||
const FunctorType & functor ,
|
||||
const Cuda::size_type block_id ,
|
||||
const Cuda::size_type block_count ,
|
||||
Cuda::size_type * const shared_data ,
|
||||
Cuda::size_type * const global_data ,
|
||||
Cuda::size_type * const global_flags ) {
|
||||
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
|
||||
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
|
||||
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
|
||||
// This thread's contribution is read from the slot indexed by threadIdx.y.
Scalar value = shared_team_buffer_elements[threadIdx.y];
|
||||
int shared_elements=blockDim.x*blockDim.y/32;
|
||||
int global_elements=block_count;
|
||||
// All contributions are read before the buffer is reused by the block reduction.
__syncthreads();
|
||||
|
||||
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
|
||||
__syncthreads();
|
||||
|
||||
// One thread per block publishes completion: fence, then atomically count this block.
unsigned int num_teams_done = 0;
|
||||
if(threadIdx.x + threadIdx.y == 0) {
|
||||
__threadfence();
|
||||
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
|
||||
}
|
||||
bool is_last_block = false;
|
||||
// __syncthreads_or shares the counting thread's verdict with the whole block.
if(__syncthreads_or(num_teams_done == gridDim.x)) {
|
||||
is_last_block=true;
|
||||
// Reset the counter to zero.
*global_flags = 0;
|
||||
ValueInit::init( functor, &value);
|
||||
// Each thread joins the per-block results at indices tid, tid+blockSize, ...
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
|
||||
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
|
||||
}
|
||||
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
|
||||
}
|
||||
return is_last_block;
|
||||
}
|
||||
};
|
||||
//----------------------------------------------------------------------------
|
||||
// See section B.17 of Cuda C Programming Guide Version 3.2
|
||||
// for discussion of
|
||||
@ -639,14 +891,15 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
||||
*
|
||||
* Global reduce result is in the last threads' 'shared_data' location.
|
||||
*/
|
||||
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
__device__
|
||||
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
const Cuda::size_type block_id ,
|
||||
const Cuda::size_type block_count ,
|
||||
Cuda::size_type * const shared_data ,
|
||||
Cuda::size_type * const global_data ,
|
||||
Cuda::size_type * const global_flags )
|
||||
bool cuda_single_inter_block_reduce_scan2( const FunctorType & functor ,
|
||||
const Cuda::size_type block_id ,
|
||||
const Cuda::size_type block_count ,
|
||||
Cuda::size_type * const shared_data ,
|
||||
Cuda::size_type * const global_data ,
|
||||
Cuda::size_type * const global_flags )
|
||||
{
|
||||
typedef Cuda::size_type size_type ;
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
@ -655,7 +908,6 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
//typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// '__ffs' = position of the least significant bit set to 1.
|
||||
// 'blockDim.y' is guaranteed to be a power of two so this
|
||||
@ -678,12 +930,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
|
||||
size_type * const global = global_data + word_count.value * block_id ;
|
||||
|
||||
//#if (__CUDA_ARCH__ < 500)
|
||||
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
|
||||
//#else
|
||||
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
//#endif
|
||||
|
||||
}
|
||||
|
||||
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
|
||||
@ -725,6 +972,22 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
return is_last_block ;
|
||||
}
|
||||
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
__device__
|
||||
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
const Cuda::size_type block_id ,
|
||||
const Cuda::size_type block_count ,
|
||||
Cuda::size_type * const shared_data ,
|
||||
Cuda::size_type * const global_data ,
|
||||
Cuda::size_type * const global_flags )
|
||||
{
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
if(!DoScan && ValueTraits::StaticValueSize)
|
||||
return Kokkos::Impl::CudaReductionsFunctor<FunctorType,ArgTag,false,(ValueTraits::StaticValueSize>16)>::scalar_inter_block_reduction(functor,block_id,block_count,shared_data,global_data,global_flags);
|
||||
else
|
||||
return cuda_single_inter_block_reduce_scan2<DoScan, FunctorType, ArgTag>(functor, block_id, block_count, shared_data, global_data, global_flags);
|
||||
}
|
||||
|
||||
// Size in bytes required for inter block reduce or scan
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
inline
|
||||
|
||||
@ -160,7 +160,7 @@ public:
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast( ValueType & val, const int& thread_id) const
|
||||
void team_broadcast( ValueType & val, const int& thread_id ) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
if ( 1 == blockDim.z ) { // team == block
|
||||
@ -178,6 +178,29 @@ public:
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Team broadcast with a pre-step: every calling thread first applies the
// closure to its own `val`, then `val` is replaced by the broadcast value.
//   f         - callable invoked as f(val) before the broadcast
//   val       - in: this thread's value; out: the broadcast value
//   thread_id - team rank of the source thread (compared against threadIdx.y)
// The body is empty when __CUDA_ARCH__ is not defined.
template<class Closure, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast( Closure const & f, ValueType & val, const int& thread_id ) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
f( val );
|
||||

|
||||
// Block-wide path: stage the source value in the m_team_reduce scratch buffer.
if ( 1 == blockDim.z ) { // team == block
|
||||
__syncthreads();
|
||||
// Wait for shared data write until all threads arrive here
|
||||
// Only the thread with threadIdx.x == 0 and threadIdx.y == thread_id writes.
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
|
||||
*((ValueType*) m_team_reduce) = val ;
|
||||
}
|
||||
__syncthreads(); // Wait for shared data read until root thread writes
|
||||
val = *((ValueType*) m_team_reduce);
|
||||
}
|
||||
else { // team <= warp
|
||||
ValueType tmp( val ); // input might not be a register variable
|
||||
// NOTE(review): cuda_shfl is defined elsewhere; presumably it copies `tmp`
// from lane blockDim.x*thread_id into `val` within groups of
// blockDim.x*blockDim.y lanes - confirm against its definition.
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/**\brief Reduction across a team
|
||||
@ -200,92 +223,7 @@ public:
|
||||
team_reduce( ReducerType const & reducer ) const noexcept
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
typedef typename ReducerType::value_type value_type ;
|
||||
|
||||
value_type tmp( reducer.reference() );
|
||||
|
||||
// reduce within the warp using shuffle
|
||||
|
||||
const int wx =
|
||||
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
|
||||
|
||||
// Root of each vector lane reduces:
|
||||
if ( 0 == threadIdx.x && wx < i ) {
|
||||
reducer.join( tmp , reducer.reference() );
|
||||
}
|
||||
}
|
||||
|
||||
if ( 1 < blockDim.z ) { // team <= warp
|
||||
// broadcast result from root vector lane of root thread
|
||||
|
||||
cuda_shfl( reducer.reference() , tmp
|
||||
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
|
||||
|
||||
}
|
||||
else { // team == block
|
||||
// Reduce across warps using shared memory
|
||||
// Broadcast result within block
|
||||
|
||||
// Number of warps, blockDim.y may not be power of two:
|
||||
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Warp index:
|
||||
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Number of shared memory entries for the reduction:
|
||||
int nsh = m_team_reduce_size / sizeof(value_type);
|
||||
|
||||
// Using at most one entry per warp:
|
||||
if ( nw < nsh ) nsh = nw ;
|
||||
|
||||
__syncthreads(); // Wait before shared data write
|
||||
|
||||
if ( 0 == wx && wy < nsh ) {
|
||||
((value_type*) m_team_reduce)[wy] = tmp ;
|
||||
}
|
||||
|
||||
// When more warps than shared entries:
|
||||
for ( int i = nsh ; i < nw ; i += nsh ) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if ( 0 == wx && i <= wy ) {
|
||||
const int k = wy - i ;
|
||||
if ( k < nsh ) {
|
||||
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// One warp performs the inter-warp reduction:
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Start at power of two covering nsh
|
||||
|
||||
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nsh ) {
|
||||
reducer.join( ((value_type*)m_team_reduce)[wx]
|
||||
, ((value_type*)m_team_reduce)[k] );
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for reduction
|
||||
|
||||
// Broadcast result to all threads
|
||||
reducer.reference() = *((value_type*)m_team_reduce);
|
||||
}
|
||||
|
||||
cuda_intra_block_reduction(reducer,blockDim.y);
|
||||
#endif /* #ifdef __CUDA_ARCH__ */
|
||||
}
|
||||
|
||||
@ -801,7 +739,11 @@ void parallel_for
|
||||
; i += blockDim.x ) {
|
||||
closure(i);
|
||||
}
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -970,7 +912,11 @@ KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0) lambda();
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -979,7 +925,11 @@ KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -2,9 +2,11 @@
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
#if ( CUDA_VERSION < 9000 )
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(x) __threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK __threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) __ballot(x)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
|
||||
@ -12,9 +14,11 @@
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down(x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down(x,y,z)
|
||||
#else
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m);
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(),x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot_sync(m,x)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl_sync(m,x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up_sync(0xffffffff,x,y,z)
|
||||
@ -23,11 +27,16 @@
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down_sync(m,x,y,z)
|
||||
#endif
|
||||
#else
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) 0
|
||||
#endif
|
||||
|
||||
#if ( CUDA_VERSION >= 9000 ) && (!defined(KOKKOS_COMPILER_CLANG))
|
||||
|
||||
@ -279,6 +279,8 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
|
||||
{
|
||||
if(arg_data_ptr == NULL) return handle_type();
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
// Assignment of texture = non-texture requires creation of a texture object
|
||||
// which can only occur on the host. In addition, 'get_record' is only valid
|
||||
@ -292,8 +294,7 @@ public:
|
||||
|
||||
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
|
||||
if ( 0 == r ) {
|
||||
//Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
|
||||
return handle_type();
|
||||
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -46,6 +46,8 @@
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
#include <Kokkos_Layout.hpp>
|
||||
|
||||
#include<impl/KokkosExp_Host_IterateTile.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
@ -63,13 +65,15 @@
|
||||
namespace Kokkos {
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// Moved to Kokkos_Layout.hpp for more general accessibility
|
||||
/*
|
||||
enum class Iterate
|
||||
{
|
||||
Default, // Default for the device
|
||||
Left, // Left indices stride fastest
|
||||
Right, // Right indices stride fastest
|
||||
};
|
||||
*/
|
||||
|
||||
template <typename ExecSpace>
|
||||
struct default_outer_direction
|
||||
|
||||
@ -45,11 +45,13 @@
|
||||
#define KOKKOS_ARRAY_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -132,6 +134,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N ; }
|
||||
KOKKOS_INLINE_FUNCTION static constexpr bool empty(){ return false ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return N ; }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -160,7 +163,7 @@ public:
|
||||
return & m_internal_implementation_private_member_data[0];
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ROCM_CLANG_WORKAROUND
|
||||
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
|
||||
// Do not default unless move and move-assignment are also defined
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~Array() = default ;
|
||||
@ -197,6 +200,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return true ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return 0 ; }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -261,6 +265,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
|
||||
// True when the array holds no elements (m_size == 0), matching the standard
// container meaning of empty(). The previous test `0 != m_size` was inverted.
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -336,6 +341,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
|
||||
// True when the array holds no elements (m_size == 0), matching the standard
// container meaning of empty(). The previous test `0 != m_size` was inverted.
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
||||
@ -105,7 +105,10 @@ namespace Kokkos {
|
||||
template< typename T > struct is_ ## CONCEPT { \
|
||||
private: \
|
||||
template< typename , typename = std::true_type > struct have : std::false_type {}; \
|
||||
template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
|
||||
template< typename U > struct have<U,typename std::is_same< \
|
||||
typename std::remove_cv<U>::type, \
|
||||
typename std::remove_cv<typename U:: CONCEPT>::type \
|
||||
>::type> : std::true_type {}; \
|
||||
public: \
|
||||
enum { value = is_ ## CONCEPT::template have<T>::value }; \
|
||||
};
|
||||
|
||||
@ -453,8 +453,9 @@ template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename
|
||||
struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,2,iType,KOKKOS_IMPL_COMPILING_LIBRARY> {
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<2,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<2,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -475,7 +476,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,3,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<3,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<3,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -496,7 +499,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,4,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<4,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<4,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -519,7 +524,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,5,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<5,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<5,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -542,7 +549,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,6,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -566,7 +575,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,7,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -590,7 +601,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,8,iType,KOKKOS_IMPL_COMPILI
|
||||
ViewTypeA a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
|
||||
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
|
||||
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
|
||||
|
||||
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
|
||||
@ -642,7 +655,9 @@ void view_copy(const DstType& dst, const SrcType& src) {
|
||||
int64_t strides[DstType::Rank+1];
|
||||
dst.stride(strides);
|
||||
Kokkos::Iterate iterate;
|
||||
if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
|
||||
if ( Kokkos::is_layouttiled<typename DstType::array_layout>::value ) {
|
||||
iterate = Kokkos::layout_iterate_type_selector<typename DstType::array_layout>::outer_iteration_pattern;
|
||||
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
|
||||
iterate = Kokkos::Iterate::Right;
|
||||
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutLeft>::value ) {
|
||||
iterate = Kokkos::Iterate::Left;
|
||||
@ -1243,9 +1258,9 @@ void deep_copy
|
||||
ViewTypeFlat;
|
||||
|
||||
ViewTypeFlat dst_flat(dst.data(),dst.size());
|
||||
if(dst.span() < std::numeric_limits<int>::max())
|
||||
if(dst.span() < std::numeric_limits<int>::max()) {
|
||||
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int >( dst_flat , value );
|
||||
else
|
||||
} else
|
||||
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int64_t >( dst_flat , value );
|
||||
Kokkos::fence();
|
||||
return;
|
||||
@ -1397,7 +1412,6 @@ void deep_copy
|
||||
enum { SrcExecCanAccessDst =
|
||||
Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
|
||||
|
||||
|
||||
// Checking for Overlapping Views.
|
||||
dst_value_type* dst_start = dst.data();
|
||||
dst_value_type* dst_end = dst.data() + dst.span();
|
||||
@ -1493,7 +1507,7 @@ void deep_copy
|
||||
Kokkos::fence();
|
||||
} else {
|
||||
Kokkos::fence();
|
||||
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),typename src_type::uniform_runtime_const_nomemspace_type(src));
|
||||
Impl::view_copy(dst, src);
|
||||
Kokkos::fence();
|
||||
}
|
||||
}
|
||||
@ -1739,8 +1753,7 @@ void deep_copy
|
||||
exec_space.fence();
|
||||
} else {
|
||||
exec_space.fence();
|
||||
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),
|
||||
typename src_type::uniform_runtime_const_nomemspace_type(src));
|
||||
Impl::view_copy(dst, src);
|
||||
exec_space.fence();
|
||||
}
|
||||
}
|
||||
@ -1917,4 +1930,213 @@ void realloc( Kokkos::View<T,P...> & v ,
|
||||
}
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// Deduce Mirror Types
|
||||
template<class Space, class T, class ... P>
|
||||
struct MirrorViewType {
|
||||
// The incoming view_type
|
||||
typedef typename Kokkos::View<T,P...> src_view_type;
|
||||
// The memory space for the mirror view
|
||||
typedef typename Space::memory_space memory_space;
|
||||
// Check whether it is the same memory space
|
||||
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
|
||||
// The array_layout
|
||||
typedef typename src_view_type::array_layout array_layout;
|
||||
// The data type (we probably want it non-const since otherwise we can't even deep_copy to it.
|
||||
typedef typename src_view_type::non_const_data_type data_type;
|
||||
// The destination view type if it is not the same memory space
|
||||
typedef Kokkos::View<data_type,array_layout,Space> dest_view_type;
|
||||
// If it is the same memory_space return the existsing view_type
|
||||
// This will also keep the unmanaged trait if necessary
|
||||
typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
|
||||
};
|
||||
|
||||
template<class Space, class T, class ... P>
|
||||
struct MirrorType {
|
||||
// The incoming view_type
|
||||
typedef typename Kokkos::View<T,P...> src_view_type;
|
||||
// The memory space for the mirror view
|
||||
typedef typename Space::memory_space memory_space;
|
||||
// Check whether it is the same memory space
|
||||
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
|
||||
// The array_layout
|
||||
typedef typename src_view_type::array_layout array_layout;
|
||||
// The data type (we probably want it non-const since otherwise we can't even deep_copy to it.
|
||||
typedef typename src_view_type::non_const_data_type data_type;
|
||||
// The destination view type if it is not the same memory space
|
||||
typedef Kokkos::View<data_type,array_layout,Space> view_type;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Create a new HostMirror view for `src`, labelled "<src label>_mirror".
// This overload is enabled only when the view's `specialize` trait is void and
// its array_layout is not Kokkos::LayoutStride.
// The second parameter is an unnamed enable_if selector defaulting to 0;
// callers do not pass it.
template< class T , class ... P >
|
||||
inline
|
||||
typename Kokkos::View<T,P...>::HostMirror
|
||||
create_mirror( const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
|
||||
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
)
|
||||
{
|
||||
typedef View<T,P...> src_type ;
|
||||
typedef typename src_type::HostMirror dst_type ;
|
||||

|
||||
// The constructor call below is completed by one of two preprocessor branches.
return dst_type( std::string( src.label() ).append("_mirror")
|
||||
// Deprecated-code build: all eight extents are always passed.
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
, src.extent(0)
|
||||
, src.extent(1)
|
||||
, src.extent(2)
|
||||
, src.extent(3)
|
||||
, src.extent(4)
|
||||
, src.extent(5)
|
||||
, src.extent(6)
|
||||
, src.extent(7) );
|
||||
// Otherwise an extent is passed only for indices below rank_dynamic; the
// remaining arguments are KOKKOS_IMPL_CTOR_DEFAULT_ARG.
#else
|
||||
, src.rank_dynamic > 0 ? src.extent(0): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 1 ? src.extent(1): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 2 ? src.extent(2): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 3 ? src.extent(3): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 4 ? src.extent(4): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 5 ? src.extent(5): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 6 ? src.extent(6): KOKKOS_IMPL_CTOR_DEFAULT_ARG
|
||||
, src.rank_dynamic > 7 ? src.extent(7): KOKKOS_IMPL_CTOR_DEFAULT_ARG );
|
||||
#endif
|
||||
}
|
||||
|
||||
template< class T , class ... P >
|
||||
inline
|
||||
typename Kokkos::View<T,P...>::HostMirror
|
||||
create_mirror( const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
|
||||
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
>::type * = 0
|
||||
)
|
||||
{
|
||||
typedef View<T,P...> src_type ;
|
||||
typedef typename src_type::HostMirror dst_type ;
|
||||
|
||||
Kokkos::LayoutStride layout ;
|
||||
|
||||
layout.dimension[0] = src.extent(0);
|
||||
layout.dimension[1] = src.extent(1);
|
||||
layout.dimension[2] = src.extent(2);
|
||||
layout.dimension[3] = src.extent(3);
|
||||
layout.dimension[4] = src.extent(4);
|
||||
layout.dimension[5] = src.extent(5);
|
||||
layout.dimension[6] = src.extent(6);
|
||||
layout.dimension[7] = src.extent(7);
|
||||
|
||||
layout.stride[0] = src.stride_0();
|
||||
layout.stride[1] = src.stride_1();
|
||||
layout.stride[2] = src.stride_2();
|
||||
layout.stride[3] = src.stride_3();
|
||||
layout.stride[4] = src.stride_4();
|
||||
layout.stride[5] = src.stride_5();
|
||||
layout.stride[6] = src.stride_6();
|
||||
layout.stride[7] = src.stride_7();
|
||||
|
||||
return dst_type( std::string( src.label() ).append("_mirror") , layout );
|
||||
}
|
||||
|
||||
|
||||
// Create a mirror in a new space (specialization for different space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorType<Space,T,P ...>::view_type
|
||||
create_mirror(const Space& , const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<
|
||||
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
|
||||
>::type * = 0) {
|
||||
return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
|
||||
}
|
||||
|
||||
template< class T , class ... P >
|
||||
inline
|
||||
typename Kokkos::View<T,P...>::HostMirror
|
||||
create_mirror_view( const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<(
|
||||
std::is_same< typename Kokkos::View<T,P...>::memory_space
|
||||
, typename Kokkos::View<T,P...>::HostMirror::memory_space
|
||||
>::value
|
||||
&&
|
||||
std::is_same< typename Kokkos::View<T,P...>::data_type
|
||||
, typename Kokkos::View<T,P...>::HostMirror::data_type
|
||||
>::value
|
||||
)>::type * = 0
|
||||
)
|
||||
{
|
||||
return src ;
|
||||
}
|
||||
|
||||
template< class T , class ... P >
|
||||
inline
|
||||
typename Kokkos::View<T,P...>::HostMirror
|
||||
create_mirror_view( const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if< ! (
|
||||
std::is_same< typename Kokkos::View<T,P...>::memory_space
|
||||
, typename Kokkos::View<T,P...>::HostMirror::memory_space
|
||||
>::value
|
||||
&&
|
||||
std::is_same< typename Kokkos::View<T,P...>::data_type
|
||||
, typename Kokkos::View<T,P...>::HostMirror::data_type
|
||||
>::value
|
||||
)>::type * = 0
|
||||
)
|
||||
{
|
||||
return Kokkos::create_mirror( src );
|
||||
}
|
||||
|
||||
// Create a mirror view in a new space (specialization for same space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
return src;
|
||||
}
|
||||
|
||||
// Create a mirror view in a new space (specialization for different space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
|
||||
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
|
||||
}
|
||||
|
||||
// Create a mirror view and deep_copy in a new space (specialization for same space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
|
||||
, std::string const& name = ""
|
||||
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
(void)name;
|
||||
return src;
|
||||
}
|
||||
|
||||
// Create a mirror view and deep_copy in a new space (specialization for different space)
|
||||
template<class Space, class T, class ... P>
|
||||
typename Impl::MirrorViewType<Space,T,P ...>::view_type
|
||||
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
|
||||
, std::string const& name = ""
|
||||
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
|
||||
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
|
||||
std::string label = name.empty() ? src.label() : name;
|
||||
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
|
||||
deep_copy(mirror, src);
|
||||
return mirror;
|
||||
}
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
||||
@ -57,6 +57,10 @@
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// Empty tag type; by its name, the marker for the parallel_for pattern.
struct ParallelForTag {};
|
||||
// Empty tag type; by its name, the marker for the parallel_scan pattern.
struct ParallelScanTag {};
|
||||
// Empty tag type; by its name, the marker for the parallel_reduce pattern.
struct ParallelReduceTag {};
|
||||
|
||||
struct ChunkSize {
|
||||
int value;
|
||||
ChunkSize(int value_):value(value_) {}
|
||||
@ -320,6 +324,10 @@ public:
|
||||
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & , const int&);
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_recommended( const FunctorType & functor , const int vector_length);
|
||||
|
||||
//----------------------------------------
|
||||
/** \brief Construct policy with the given instance of the execution space */
|
||||
TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
|
||||
|
||||
@ -76,6 +76,8 @@ struct LayoutLeft {
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
|
||||
|
||||
enum { is_extent_constructible = true };
|
||||
|
||||
LayoutLeft( LayoutLeft const & ) = default ;
|
||||
LayoutLeft( LayoutLeft && ) = default ;
|
||||
LayoutLeft & operator = ( LayoutLeft const & ) = default ;
|
||||
@ -108,6 +110,8 @@ struct LayoutRight {
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
|
||||
|
||||
enum { is_extent_constructible = true };
|
||||
|
||||
LayoutRight( LayoutRight const & ) = default ;
|
||||
LayoutRight( LayoutRight && ) = default ;
|
||||
LayoutRight & operator = ( LayoutRight const & ) = default ;
|
||||
@ -132,6 +136,8 @@ struct LayoutStride {
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
|
||||
enum { is_extent_constructible = false };
|
||||
|
||||
LayoutStride( LayoutStride const & ) = default ;
|
||||
LayoutStride( LayoutStride && ) = default ;
|
||||
LayoutStride & operator = ( LayoutStride const & ) = default ;
|
||||
@ -222,6 +228,8 @@ struct LayoutTileLeft {
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
|
||||
enum { is_extent_constructible = true };
|
||||
|
||||
LayoutTileLeft( LayoutTileLeft const & ) = default ;
|
||||
LayoutTileLeft( LayoutTileLeft && ) = default ;
|
||||
LayoutTileLeft & operator = ( LayoutTileLeft const & ) = default ;
|
||||
@ -235,6 +243,144 @@ struct LayoutTileLeft {
|
||||
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
|
||||
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Iteration-order selector used by the tiled layouts and by
/// layout_iterate_type_selector.
enum class Iterate
{
  Default,  // No explicit preference
  Left,     // Left indices stride fastest
  Right     // Right indices stride fastest
};
|
||||
|
||||
// Trait: is the given layout type a LayoutTiled?
// The check is phrased on the layout type as a whole (with an Enable slot for
// SFINAE) so that the extra compile-time 'identifier' info, i.e. the ArgN*
// tile sizes, stays hidden inside the LayoutTiled class instead of being
// spelled out in a template specialization.
// Primary template: the answer is "no".
template < typename LayoutTiledCheck, class Enable = void >
struct is_layouttiled : std::false_type {};
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template < typename LayoutTiledCheck >
|
||||
struct is_layouttiled< LayoutTiledCheck, typename std::enable_if<LayoutTiledCheck::is_array_layout_tiled>::type > : std::true_type {};
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/// LayoutTiled
|
||||
// Must have Rank >= 2
|
||||
template < Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
|
||||
unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0,
|
||||
bool IsPowerOfTwo =
|
||||
( Impl::is_integral_power_of_two(ArgN0) &&
|
||||
Impl::is_integral_power_of_two(ArgN1) &&
|
||||
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
|
||||
)
|
||||
>
|
||||
struct LayoutTiled {
|
||||
|
||||
static_assert( IsPowerOfTwo
|
||||
, "LayoutTiled must be given power-of-two tile dimensions" );
|
||||
|
||||
#if 0
|
||||
static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN1) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
|
||||
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
|
||||
, "LayoutTiled must be given power-of-two tile dimensions" );
|
||||
#endif
|
||||
|
||||
typedef LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo> array_layout ;
|
||||
static constexpr Iterate outer_pattern = OuterP;
|
||||
static constexpr Iterate inner_pattern = InnerP;
|
||||
|
||||
enum { N0 = ArgN0 };
|
||||
enum { N1 = ArgN1 };
|
||||
enum { N2 = ArgN2 };
|
||||
enum { N3 = ArgN3 };
|
||||
enum { N4 = ArgN4 };
|
||||
enum { N5 = ArgN5 };
|
||||
enum { N6 = ArgN6 };
|
||||
enum { N7 = ArgN7 };
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
|
||||
enum { is_extent_constructible = true };
|
||||
|
||||
LayoutTiled( LayoutTiled const & ) = default ;
|
||||
LayoutTiled( LayoutTiled && ) = default ;
|
||||
LayoutTiled & operator = ( LayoutTiled const & ) = default ;
|
||||
LayoutTiled & operator = ( LayoutTiled && ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
explicit constexpr
|
||||
LayoutTiled( size_t argN0 = 0 , size_t argN1 = 0 , size_t argN2 = 0 , size_t argN3 = 0
|
||||
, size_t argN4 = 0 , size_t argN5 = 0 , size_t argN6 = 0 , size_t argN7 = 0
|
||||
)
|
||||
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
|
||||
};
|
||||
|
||||
} // namespace Experimental
|
||||
#endif
|
||||
|
||||
|
||||
// For use with view_copy
|
||||
template < typename ... Layout >
|
||||
struct layout_iterate_type_selector {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct layout_iterate_type_selector< Kokkos::LayoutRight > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct layout_iterate_type_selector< Kokkos::LayoutLeft > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct layout_iterate_type_selector< Kokkos::LayoutStride > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
|
||||
};
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
|
||||
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
};
|
||||
|
||||
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
|
||||
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
};
|
||||
|
||||
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
|
||||
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
};
|
||||
|
||||
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
|
||||
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
|
||||
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // #ifndef KOKKOS_LAYOUT_HPP
|
||||
|
||||
@ -153,7 +153,7 @@
|
||||
#else
|
||||
#define KOKKOS_LAMBDA [=]__host__ __device__
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CXX1Z )
|
||||
#if defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20 )
|
||||
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
|
||||
#endif
|
||||
#endif
|
||||
@ -213,7 +213,7 @@
|
||||
#define KOKKOS_LAMBDA [=]
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
|
||||
#if (defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20) )&& !defined( KOKKOS_CLASS_LAMBDA )
|
||||
#define KOKKOS_CLASS_LAMBDA [=,*this]
|
||||
#endif
|
||||
|
||||
@ -521,6 +521,9 @@
|
||||
#if defined ( KOKKOS_ENABLE_CUDA )
|
||||
#if ( 9000 <= CUDA_VERSION )
|
||||
#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
|
||||
#if ( __CUDA_ARCH__ )
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@ -793,7 +793,7 @@ struct ParallelReduceReturnValue<typename std::enable_if<
|
||||
|
||||
static return_type return_value(ReturnType& return_val,
|
||||
const FunctorType& functor) {
|
||||
#ifdef KOKOOS_ENABLE_DEPRECATED_CODE
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
return return_type(return_val,functor.value_count);
|
||||
#else
|
||||
if ( is_array<ReturnType>::value )
|
||||
@ -1002,7 +1002,8 @@ void parallel_reduce(const std::string& label,
|
||||
typename Impl::enable_if<
|
||||
Kokkos::Impl::is_execution_policy<PolicyType>::value
|
||||
>::type * = 0) {
|
||||
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value);
|
||||
ReturnType return_value_impl = return_value;
|
||||
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value_impl);
|
||||
}
|
||||
|
||||
template< class PolicyType, class FunctorType, class ReturnType >
|
||||
@ -1054,6 +1055,9 @@ void parallel_reduce(const std::string& label,
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
|
||||
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
|
||||
|
||||
typedef Kokkos::View< value_type
|
||||
, Kokkos::HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
@ -1076,6 +1080,9 @@ void parallel_reduce(const PolicyType& policy,
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
|
||||
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
|
||||
|
||||
typedef Kokkos::View< value_type
|
||||
, Kokkos::HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
@ -1096,6 +1103,9 @@ void parallel_reduce(const size_t& policy,
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
|
||||
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
|
||||
|
||||
typedef Kokkos::View< value_type
|
||||
, Kokkos::HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
@ -1117,6 +1127,9 @@ void parallel_reduce(const std::string& label,
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
|
||||
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
|
||||
|
||||
typedef Kokkos::View< value_type
|
||||
, Kokkos::HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
|
||||
@ -136,6 +136,55 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Hand out `size` bytes from the level-0 or level-1 scratch buffer after
// rounding that level's allocation cursor up to a multiple of `alignment`.
//
//   size      : requested byte count. The returned pointer is the aligned
//               cursor plus m_offset * size, and the cursor then advances by
//               size * m_multiplier.
//   alignment : the cursor address is rounded up to a multiple of this value.
//   level     : 0 selects the L0 cursor/end pair, any other value selects L1;
//               -1 (the default) is first replaced by m_default_level.
//
// Returns the pointer on success. If the advanced cursor would pass the end of
// the selected buffer, the cursor is restored to its value on entry (undoing
// the alignment padding as well) and a null pointer is returned.
KOKKOS_INLINE_FUNCTION
void* get_shmem_aligned (const ptrdiff_t size, const ptrdiff_t alignment, int level = -1) const {
  if(level == -1)
    level = m_default_level;

  // Both levels follow the same algorithm; bind the selected cursor and limit
  // once instead of duplicating the whole body per level.
  char* &     iter = (level == 0) ? m_iter_L0 : m_iter_L1;
  char* const end  = (level == 0) ? m_end_L0  : m_end_L1;

  char* const previous = iter;

  // Round the cursor up to the requested alignment.
  const ptrdiff_t missalign = size_t(iter)%alignment;
  if(missalign) iter += alignment-missalign;

  void* tmp = iter + m_offset * size;
  iter += size * m_multiplier;

  if (end < iter) {
    iter = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
    // mfh 23 Jun 2015: printf call consumes 25 registers
    // in a CUDA build, so only print in debug mode.  The
    // function still returns NULL if not enough memory.
    printf ("ScratchMemorySpace<...>::get_shmem_aligned: Failed to allocate "
            "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
            long(end-iter));
#endif // KOKKOS_DEBUG
    tmp = nullptr;
  }
  return tmp;
}
|
||||
|
||||
template< typename IntType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
|
||||
|
||||
@ -262,7 +262,7 @@ public:
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
// Deprecated static query: this policy always reports a maximum team size
// of 1, independent of the functor.
template< class FunctorType >
static
int team_size_max( const FunctorType & )
{
  return 1 ;
}
|
||||
@ -274,6 +274,16 @@ public:
|
||||
// Deprecated static query: this policy always recommends a team size of 1;
// both the functor and the integer argument are ignored.
template< class FunctorType >
static
int team_size_recommended( const FunctorType & , const int& )
{
  return 1 ;
}
|
||||
#endif
|
||||
|
||||
template<class FunctorType>
|
||||
int team_size_max( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
|
||||
template<class FunctorType>
|
||||
int team_size_max( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
|
||||
template<class FunctorType>
|
||||
int team_size_recommended( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
|
||||
template<class FunctorType>
|
||||
int team_size_recommended( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
@ -281,6 +291,16 @@ public:
|
||||
// Returns the stored league size (m_league_size).
inline int league_size() const { return m_league_size ; }
|
||||
// Returns m_team_scratch_size[level] + m_thread_scratch_size[level].
// The second, unnamed int argument is accepted but not used.
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
|
||||
|
||||
// Largest vector length this policy reports.
// The value is an arbitrary large number; it is meant as a vectorizable length.
inline static
int vector_length_max()
{
  return 1024;
}
|
||||
|
||||
// Upper bound, in bytes, reported for scratch memory at `level`:
// 32 KiB (1024*32) for level 0, 20 MiB (20*1024*1024) for any other level.
inline static
int scratch_size_max(int level)
{
  if ( level == 0 ) {
    return 1024*32;
  }
  return 20*1024*1024;
}
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( execution_space &
|
||||
, int league_size_request
|
||||
|
||||