diff --git a/SECURITY.md b/SECURITY.md index 9f65e2f88e..f06b781d11 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -23,6 +23,10 @@ either a user mistake or a bug in the code. Bugs can be reported in the LAMMPS project [issue tracker on GitHub](https://github.com/lammps/lammps/issues). +To mitigate issues with using homoglyphs or bidirectional reordering in +unicode, which have been demonstrated as a vector to obfuscate and hide +malicious changes to the source code, all LAMMPS submissions are checked +for unicode characters and only all-ASCII source code is accepted. # Version Updates diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 101e0e13d3..b5f8db93d2 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -133,10 +133,7 @@ endif() set(LAMMPS_BINARY lmp${LAMMPS_MACHINE}) option(BUILD_SHARED_LIBS "Build shared library" OFF) -if(BUILD_SHARED_LIBS) # for all pkg libs, mpi_stubs and linalg - set(CMAKE_POSITION_INDEPENDENT_CODE ON) -endif() - +option(CMAKE_POSITION_INDEPENDENT_CODE "Create object compatible with shared libraries" ON) option(BUILD_TOOLS "Build and install LAMMPS tools (msi2lmp, binary2txt, chain)" OFF) option(BUILD_LAMMPS_SHELL "Build and install the LAMMPS shell" OFF) @@ -304,10 +301,12 @@ else() target_link_libraries(lmp PRIVATE mpi_stubs) target_include_directories(lmp INTERFACE $) target_compile_definitions(lmp INTERFACE $) - endif(MSVC) + endif() target_include_directories(lammps INTERFACE $) target_compile_definitions(lammps INTERFACE $) else() + target_include_directories(lammps INTERFACE $) + target_compile_definitions(lammps INTERFACE $) target_link_libraries(lammps PUBLIC mpi_stubs) endif() add_library(MPI::MPI_CXX ALIAS mpi_stubs) @@ -341,7 +340,6 @@ pkg_depends(ML-IAP ML-SNAP) pkg_depends(MPIIO MPI) pkg_depends(ATC MANYBODY) pkg_depends(LATBOLTZ MPI) -pkg_depends(PHONON KSPACE) pkg_depends(SCAFACOS MPI) pkg_depends(DIELECTRIC KSPACE) pkg_depends(DIELECTRIC EXTRA-PAIR) @@ -611,7 +609,7 @@ endif() # packages which 
selectively include variants based on enabled styles # e.g. accelerator packages ###################################################################### -foreach(PKG_WITH_INCL CORESHELL QEQ OPENMP DPD-SMOOTH KOKKOS OPT INTEL GPU) +foreach(PKG_WITH_INCL CORESHELL DPD-SMOOTH PHONON QEQ OPENMP KOKKOS OPT INTEL GPU) if(PKG_${PKG_WITH_INCL}) include(Packages/${PKG_WITH_INCL}) endif() @@ -810,11 +808,17 @@ if(ClangFormat_FOUND) endif() get_target_property(DEFINES lammps COMPILE_DEFINITIONS) +get_property(BUILD_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(BUILD_IS_MULTI_CONFIG) + set(LAMMPS_BUILD_TYPE "Multi-Config") +else() + set(LAMMPS_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +endif() include(FeatureSummary) feature_summary(DESCRIPTION "The following tools and libraries have been found and configured:" WHAT PACKAGES_FOUND) message(STATUS "<<< Build configuration >>> Operating System: ${CMAKE_SYSTEM_NAME} ${CMAKE_LINUX_DISTRO} ${CMAKE_DISTRO_VERSION} - Build type: ${CMAKE_BUILD_TYPE} + Build type: ${LAMMPS_BUILD_TYPE} Install path: ${CMAKE_INSTALL_PREFIX} Generator: ${CMAKE_GENERATOR} using ${CMAKE_MAKE_PROGRAM}") ############################################################################### diff --git a/cmake/CMakeSettings.json b/cmake/CMakeSettings.json index dada2f6752..ee4b3c46d5 100644 --- a/cmake/CMakeSettings.json +++ b/cmake/CMakeSettings.json @@ -1,55 +1,111 @@ { - "configurations": [ + "configurations": [ + { + "name": "x64-Debug-MSVC", + "generator": "Ninja", + "configurationType": "Debug", + "buildRoot": "${workspaceRoot}\\build\\${name}", + "installRoot": "${workspaceRoot}\\install\\${name}", + "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [ { - "name": "x64-Debug-MSVC", - "generator": "Ninja", - "configurationType": "Debug", - "buildRoot": 
"${workspaceRoot}\\build\\${name}", - "installRoot": "${workspaceRoot}\\install\\${name}", - "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake", - "buildCommandArgs": "", - "ctestCommandArgs": "", - "inheritEnvironments": [ "msvc_x64_x64" ], - "variables": [ - { - "name": "BUILD_SHARED_LIBS", - "value": "True", - "type": "BOOL" - }, - { - "name": "BUILD_TOOLS", - "value": "True", - "type": "BOOL" - }, - { - "name": "LAMMPS_EXCEPTIONS", - "value": "True", - "type": "BOOL" - } - ] + "name": "BUILD_SHARED_LIBS", + "value": "True", + "type": "BOOL" }, { - "name": "x64-Debug-Clang", - "generator": "Ninja", - "configurationType": "Debug", - "buildRoot": "${workspaceRoot}\\build\\${name}", - "installRoot": "${workspaceRoot}\\install\\${name}", - "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake", - "buildCommandArgs": "", - "ctestCommandArgs": "", - "inheritEnvironments": [ "clang_cl_x64" ], - "variables": [ - { - "name": "BUILD_TOOLS", - "value": "True", - "type": "BOOL" - }, - { - "name": "LAMMPS_EXCEPTIONS", - "value": "True", - "type": "BOOL" - } - ] + "name": "BUILD_TOOLS", + "value": "True", + "type": "BOOL" + }, + { + "name": "LAMMPS_EXCEPTIONS", + "value": "True", + "type": "BOOL" } - ] + ] + }, + { + "name": "x64-Debug-Clang", + "generator": "Ninja", + "configurationType": "Debug", + "buildRoot": "${workspaceRoot}\\build\\${name}", + "installRoot": "${workspaceRoot}\\install\\${name}", + "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "clang_cl_x64" ], + "variables": [ + { + "name": "BUILD_TOOLS", + "value": "True", + "type": "BOOL" + }, + { + "name": "LAMMPS_EXCEPTIONS", + "value": "True", + "type": "BOOL" + } + ] + }, + { + "name": "x64-Debug-OneAPI", + "generator": "Ninja", + "configurationType": "Debug", + 
"buildRoot": "${workspaceRoot}\\build\\${name}", + "installRoot": "${workspaceRoot}\\install\\${name}", + "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on -DCMAKE_CXX_COMPILER=icx -DCMAKE_C_COMPILER=icx -DBUILD_MPI=off", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [ + { + "name": "BUILD_SHARED_LIBS", + "value": "True", + "type": "BOOL" + }, + { + "name": "BUILD_TOOLS", + "value": "True", + "type": "BOOL" + }, + { + "name": "LAMMPS_EXCEPTIONS", + "value": "True", + "type": "BOOL" + } + ] + }, + { + "name": "x64-Debug-Intel", + "generator": "Ninja", + "configurationType": "Debug", + "buildRoot": "${workspaceRoot}\\build\\${name}", + "installRoot": "${workspaceRoot}\\install\\${name}", + "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=off -DCMAKE_CXX_COMPILER=icl -DCMAKE_C_COMPILER=icl -DCMAKE_Fortran_COMPILER=ifort -DBUILD_MPI=off", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [ + { + "name": "BUILD_SHARED_LIBS", + "value": "True", + "type": "BOOL" + }, + { + "name": "BUILD_TOOLS", + "value": "True", + "type": "BOOL" + }, + { + "name": "LAMMPS_EXCEPTIONS", + "value": "True", + "type": "BOOL" + } + ] + } + ] } \ No newline at end of file diff --git a/cmake/Modules/ExternalCMakeProject.cmake b/cmake/Modules/ExternalCMakeProject.cmake new file mode 100644 index 0000000000..855ce254c9 --- /dev/null +++ b/cmake/Modules/ExternalCMakeProject.cmake @@ -0,0 +1,33 @@ +# Build a CMake based external library as subdirectory. 
+# The sources will be unpacked to ${CMAKE_BINARY_DIR}/_deps/${target}-src +# The binaries will be built in ${CMAKE_BINARY_DIR}/_deps/${target}-build +# +function(ExternalCMakeProject target url hash basedir cmakedir cmakefile) + # change settings locally: these variables are function-scoped and only affect the subproject added below + set(BUILD_SHARED_LIBS OFF) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + + get_filename_component(archive ${url} NAME) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/src) + message(STATUS "Downloading ${url}") + file(DOWNLOAD ${url} ${CMAKE_BINARY_DIR}/_deps/${archive} EXPECTED_HASH MD5=${hash} SHOW_PROGRESS) + message(STATUS "Unpacking and configuring ${archive}") + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_BINARY_DIR}/_deps/${archive} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/src) + file(GLOB TARGET_SOURCE "${CMAKE_BINARY_DIR}/_deps/src/${basedir}*") + list(LENGTH TARGET_SOURCE _num) + if(NOT _num EQUAL 1) + message(FATAL_ERROR "Expected exactly one ${target} source folder matching '${basedir}*' but found ${_num}. " + "Please delete ${CMAKE_BINARY_DIR}/_deps/src and re-run cmake") + endif() + file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/_deps/${target}-src) + file(RENAME ${TARGET_SOURCE} ${CMAKE_BINARY_DIR}/_deps/${target}-src) + if(NOT (cmakefile STREQUAL "")) + file(COPY ${cmakefile} DESTINATION ${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/) + get_filename_component(_cmakefile ${cmakefile} NAME) + file(RENAME "${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/${_cmakefile}" + "${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/CMakeLists.txt") + endif() + add_subdirectory("${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}" + "${CMAKE_BINARY_DIR}/_deps/${target}-build") +endfunction(ExternalCMakeProject) diff --git a/cmake/Modules/GTest.cmake b/cmake/Modules/GTest.cmake deleted file mode 100644 index e012e61ea9..0000000000 --- a/cmake/Modules/GTest.cmake +++ /dev/null @@ -1,81 +0,0 @@ -message(STATUS "Downloading and building Google Test library") - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(GTEST_LIB_POSTFIX d) -else() - 
set(GTEST_LIB_POSTFIX) -endif() - -include(ExternalProject) -set(GTEST_URL "https://github.com/google/googletest/archive/release-1.11.0.tar.gz" CACHE STRING "URL of googletest source") -set(GTEST_MD5 "e8a8df240b6938bb6384155d4c37d937" CACHE STRING "MD5 sum for googletest source") -mark_as_advanced(GTEST_URL) -mark_as_advanced(GTEST_MD5) -ExternalProject_Add(googletest - URL ${GTEST_URL} - URL_MD5 ${GTEST_MD5} - SOURCE_DIR "${CMAKE_BINARY_DIR}/gtest-src" - BINARY_DIR "${CMAKE_BINARY_DIR}/gtest-build" - CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_GTEST_OPTS} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_INSTALL_PREFIX= - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} - -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_BYPRODUCTS /lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} - /lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} - /lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} - /lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} - LOG_DOWNLOAD ON - LOG_CONFIGURE ON - LOG_BUILD ON - INSTALL_COMMAND "" - TEST_COMMAND "") - -ExternalProject_Get_Property(googletest SOURCE_DIR) -set(GTEST_INCLUDE_DIR ${SOURCE_DIR}/googletest/include) -set(GMOCK_INCLUDE_DIR ${SOURCE_DIR}/googlemock/include) - -# workaround for CMake 3.10 on ubuntu 18.04 -file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIR}) -file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIR}) - -ExternalProject_Get_Property(googletest BINARY_DIR) -set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) -set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) -set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) -set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) - -# Prevent GoogleTest from overriding our compiler/linker 
options -# when building with Visual Studio -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - -find_package(Threads QUIET) - -add_library(GTest::GTest UNKNOWN IMPORTED) -set_target_properties(GTest::GTest PROPERTIES - IMPORTED_LOCATION ${GTEST_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") -add_dependencies(GTest::GTest googletest) - -add_library(GTest::GMock UNKNOWN IMPORTED) -set_target_properties(GTest::GMock PROPERTIES - IMPORTED_LOCATION ${GMOCK_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") -add_dependencies(GTest::GMock googletest) - -add_library(GTest::GTestMain UNKNOWN IMPORTED) -set_target_properties(GTest::GTestMain PROPERTIES - IMPORTED_LOCATION ${GTEST_MAIN_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") -add_dependencies(GTest::GTestMain googletest) - -add_library(GTest::GMockMain UNKNOWN IMPORTED) -set_target_properties(GTest::GMockMain PROPERTIES - IMPORTED_LOCATION ${GMOCK_MAIN_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") -add_dependencies(GTest::GMockMain googletest) diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index 28ad99fa31..943c3d851e 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -25,7 +25,7 @@ function(validate_option name values) endfunction(validate_option) function(get_lammps_version version_header variable) - file(READ ${version_header} line) + file(STRINGS ${version_header} line REGEX LAMMPS_VERSION) set(MONTHS x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\1" day "${line}") string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\2" month "${line}") 
diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index a57715d294..048c0ed473 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -306,12 +306,12 @@ elseif(GPU_API STREQUAL "HIP") if(HIP_COMPILER STREQUAL "clang") add_custom_command(OUTPUT ${CUBIN_FILE} - VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE} + VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE} DEPENDS ${CU_CPP_FILE} COMMENT "Generating ${CU_NAME}.cubin") else() add_custom_command(OUTPUT ${CUBIN_FILE} - VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE} + VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE} DEPENDS ${CU_CPP_FILE} COMMENT "Generating ${CU_NAME}.cubin") endif() diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index fe6c17801e..25211268e9 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -39,8 +39,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "4c84698917c93a18985b311bb6caf84f" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL 
"https://github.com/kokkos/kokkos/archive/3.5.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "079323d973ae0e1c38c0a54a150c674e" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) ExternalProject_Add(kokkos_build @@ -60,7 +60,7 @@ if(DOWNLOAD_KOKKOS) target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS) add_dependencies(LAMMPS::KOKKOS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.4.01 REQUIRED CONFIG) + find_package(Kokkos 3.5.00 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos) else() diff --git a/cmake/Modules/Packages/ML-QUIP.cmake b/cmake/Modules/Packages/ML-QUIP.cmake index 92418e8939..947c555842 100644 --- a/cmake/Modules/Packages/ML-QUIP.cmake +++ b/cmake/Modules/Packages/ML-QUIP.cmake @@ -32,7 +32,8 @@ if(DOWNLOAD_QUIP) foreach(flag ${LAPACK_LIBRARIES}) set(temp "${temp} ${flag}") endforeach() - set(temp "${temp}\n") + # Fix cmake crashing when MATH_LINKOPTS not set, required for e.g. 
recent Cray Programming Environment + set(temp "${temp} -L/_DUMMY_PATH_\n") set(temp "${temp}PYTHON=python\nPIP=pip\nEXTRA_LINKOPTS=\n") set(temp "${temp}HAVE_CP2K=0\nHAVE_VASP=0\nHAVE_TB=0\nHAVE_PRECON=1\nHAVE_LOTF=0\nHAVE_ONIOM=0\n") set(temp "${temp}HAVE_LOCAL_E_MIX=0\nHAVE_QC=0\nHAVE_GAP=1\nHAVE_DESCRIPTORS_NONCOMMERCIAL=1\n") diff --git a/cmake/Modules/Packages/MSCG.cmake b/cmake/Modules/Packages/MSCG.cmake index cf3d506c82..e4260e059e 100644 --- a/cmake/Modules/Packages/MSCG.cmake +++ b/cmake/Modules/Packages/MSCG.cmake @@ -12,41 +12,12 @@ if(DOWNLOAD_MSCG) mark_as_advanced(MSCG_URL) mark_as_advanced(MSCG_MD5) - # CMake cannot pass BLAS or LAPACK library variable to external project if they are a list - list(LENGTH BLAS_LIBRARIES} NUM_BLAS) - list(LENGTH LAPACK_LIBRARIES NUM_LAPACK) - if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1)) - message(FATAL_ERROR "Cannot compile downloaded MSCG library due to a technical limitation") - endif() + include(ExternalCMakeProject) + ExternalCMakeProject(mscg ${MSCG_URL} ${MSCG_MD5} MSCG-release src/CMake "") - include(ExternalProject) - ExternalProject_Add(mscg_build - URL ${MSCG_URL} - URL_MD5 ${MSCG_MD5} - SOURCE_SUBDIR src/CMake - CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${EXTRA_MSCG_OPTS} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER} - -DBLAS_LIBRARIES=${BLAS_LIBRARIES} -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} - -DCMAKE_INSTALL_PREFIX= - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} - -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_COMMAND ${CMAKE_COMMAND} --build . 
--target mscg - INSTALL_COMMAND "" - BUILD_BYPRODUCTS /libmscg.a - ) - ExternalProject_get_property(mscg_build BINARY_DIR) - ExternalProject_get_property(mscg_build SOURCE_DIR) - file(MAKE_DIRECTORY ${SOURCE_DIR}/src) - add_library(LAMMPS::MSCG UNKNOWN IMPORTED) - set_target_properties(LAMMPS::MSCG PROPERTIES - IMPORTED_LOCATION "${BINARY_DIR}/libmscg.a" - INTERFACE_INCLUDE_DIRECTORIES "${SOURCE_DIR}/src" - INTERFACE_LINK_LIBRARIES "${LAPACK_LIBRARIES}") - target_link_libraries(lammps PRIVATE LAMMPS::MSCG) - add_dependencies(LAMMPS::MSCG mscg_build) + # set include and link library + target_include_directories(lammps PRIVATE "${CMAKE_BINARY_DIR}/_deps/mscg-src/src") + target_link_libraries(lammps PRIVATE mscg) else() find_package(MSCG) if(NOT MSCG_FOUND) diff --git a/cmake/Modules/Packages/PHONON.cmake b/cmake/Modules/Packages/PHONON.cmake new file mode 100644 index 0000000000..3021868f68 --- /dev/null +++ b/cmake/Modules/Packages/PHONON.cmake @@ -0,0 +1,9 @@ +# fix phonon may only be installed if also the FFT wrappers from KSPACE are installed +if(NOT PKG_KSPACE) + get_property(LAMMPS_FIX_HEADERS GLOBAL PROPERTY FIX) + list(REMOVE_ITEM LAMMPS_FIX_HEADERS ${LAMMPS_SOURCE_DIR}/PHONON/fix_phonon.h) + set_property(GLOBAL PROPERTY FIX "${LAMMPS_FIX_HEADERS}") + get_target_property(LAMMPS_SOURCES lammps SOURCES) + list(REMOVE_ITEM LAMMPS_SOURCES ${LAMMPS_SOURCE_DIR}/PHONON/fix_phonon.cpp) + set_property(TARGET lammps PROPERTY SOURCES "${LAMMPS_SOURCES}") +endif() diff --git a/cmake/Modules/Packages/PLUMED.cmake b/cmake/Modules/Packages/PLUMED.cmake index 0f063f3e14..6b832574ca 100644 --- a/cmake/Modules/Packages/PLUMED.cmake +++ b/cmake/Modules/Packages/PLUMED.cmake @@ -54,8 +54,8 @@ if(DOWNLOAD_PLUMED) set(PLUMED_BUILD_BYPRODUCTS "/lib/libplumedWrapper.a") endif() - set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz" CACHE STRING "URL for PLUMED tarball") - set(PLUMED_MD5 "cfa0b4dd90a81c25d3302e8d97bfeaea" CACHE STRING "MD5 
checksum of PLUMED tarball") + set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.3/plumed-src-2.7.3.tgz" CACHE STRING "URL for PLUMED tarball") + set(PLUMED_MD5 "f00cc82edfefe6bb3df934911dbe32fb" CACHE STRING "MD5 checksum of PLUMED tarball") mark_as_advanced(PLUMED_URL) mark_as_advanced(PLUMED_MD5) diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake deleted file mode 100644 index 77ee804111..0000000000 --- a/cmake/Modules/YAML.cmake +++ /dev/null @@ -1,47 +0,0 @@ -message(STATUS "Downloading and building YAML library") - -include(ExternalProject) -set(YAML_URL "https://pyyaml.org/download/libyaml/yaml-0.2.5.tar.gz" CACHE STRING "URL for libyaml tarball") -set(YAML_MD5 "bb15429d8fb787e7d3f1c83ae129a999" CACHE STRING "MD5 checksum of libyaml tarball") -mark_as_advanced(YAML_URL) -mark_as_advanced(YAML_MD5) - -# support cross-compilation to windows -if(CMAKE_CROSSCOMPILING AND (CMAKE_SYSTEM_NAME STREQUAL "Windows")) - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86") - set(YAML_CROSS_HOST --host=i686-mingw64) - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(YAML_CROSS_HOST --host=x86_64-mingw64) - else() - message(FATAL_ERROR "Unsupported cross-compilation " - " for ${CMAKE_SYSTEM_NAME}/${CMAKE_SYSTEM_PROCESSOR}" - " on ${CMAKE_HOST_SYSTEM}/${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -endif() - -ExternalProject_Add(libyaml - URL ${YAML_URL} - URL_MD5 ${YAML_MD5} - SOURCE_DIR "${CMAKE_BINARY_DIR}/yaml-src" - BINARY_DIR "${CMAKE_BINARY_DIR}/yaml-build" - CONFIGURE_COMMAND /configure ${CONFIGURE_REQUEST_PIC} - CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} - --prefix= --disable-shared ${YAML_CROSS_HOST} - BUILD_BYPRODUCTS /lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX} - TEST_COMMAND "") - -ExternalProject_Get_Property(libyaml INSTALL_DIR) -set(YAML_INCLUDE_DIR ${INSTALL_DIR}/include) -set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib) - -# workaround for CMake 3.10 on ubuntu 18.04 -file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR}) -file(MAKE_DIRECTORY 
${YAML_LIBRARY_DIR}) - -set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX}) - -add_library(Yaml::Yaml UNKNOWN IMPORTED) -set_target_properties(Yaml::Yaml PROPERTIES - IMPORTED_LOCATION ${YAML_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${YAML_INCLUDE_DIR}) -add_dependencies(Yaml::Yaml libyaml) diff --git a/cmake/presets/most.cmake b/cmake/presets/most.cmake index eb26b38928..27ce57621c 100644 --- a/cmake/presets/most.cmake +++ b/cmake/presets/most.cmake @@ -48,7 +48,6 @@ set(ALL_PACKAGES PHONON PLUGIN POEMS - PYTHON QEQ REACTION REAXFF diff --git a/doc/Makefile b/doc/Makefile index d61f844a1b..a082018dfb 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -230,7 +230,7 @@ $(VENV): ) $(MATHJAX): - @git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 git://github.com/mathjax/MathJax.git $@ + @git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 https://github.com/mathjax/MathJax.git $@ $(ANCHORCHECK): $(VENV) @( \ diff --git a/doc/lammps.1 b/doc/lammps.1 index 78b6c9fd67..58086b1fae 100644 --- a/doc/lammps.1 +++ b/doc/lammps.1 @@ -1,4 +1,4 @@ -.TH LAMMPS "1" "27 October 2021" "2021-10-27" +.TH LAMMPS "1" "7 January 2022" "2022-1-7" .SH NAME .B LAMMPS \- Molecular Dynamics Simulator. diff --git a/doc/src/Bibliography.rst b/doc/src/Bibliography.rst index 0256552332..9f3591dcde 100644 --- a/doc/src/Bibliography.rst +++ b/doc/src/Bibliography.rst @@ -1123,9 +1123,12 @@ Bibliography **(Sun)** Sun, J. Phys. Chem. B, 102, 7338-7364 (1998). -**(Surblys)** +**(Surblys2019)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019). +**(Surblys2021)** + Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021). + **(Sutmann)** Sutmann, Arnold, Fahrenberger, et. 
al., Physical review / E 88(6), 063308 (2013) diff --git a/doc/src/Build_cmake.rst b/doc/src/Build_cmake.rst index 2a64bc3240..9bee18146c 100644 --- a/doc/src/Build_cmake.rst +++ b/doc/src/Build_cmake.rst @@ -150,6 +150,42 @@ for IDEs like Eclipse, CodeBlocks, or Kate can be selected using the *-G* command line flag. A list of available generator settings for your specific CMake version is given when running ``cmake --help``. +.. _cmake_multiconfig: + +Multi-configuration build systems +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Throughout this manual it is mostly assumed that LAMMPS is being built +on a Unix-like operating system with "make" as the underlying "builder", +since this is the most common case. In this case the build "configuration" +is chosen using ``-D CMAKE_BUILD_TYPE=<configuration>`` with ``<configuration>`` +being one of "Release", "Debug", "RelWithDebInfo", or "MinSizeRel". +Some build tools, however, can also use or even require to have a so-called +multi-configuration build system setup. For those the build type (or +configuration) is chosen at compile time using the same build files. E.g. +with: + +.. code-block:: bash + + cmake --build build-multi --config Release + +In that case the resulting binaries are not in the build folder directly +but in sub-directories corresponding to the build type (i.e. Release in +the example from above). Similarly, for running unit tests the +configuration is selected with the *-C* flag: + +.. code-block:: bash + + ctest -C Debug + +The CMake scripts in LAMMPS have basic support for being compiled using a +multi-config build system, but not all of it has been ported. This is in +particular applicable to compiling packages that require additional libraries +that would be downloaded and compiled by CMake. The "windows" preset file +tries to keep track of which packages can be compiled natively with the +MSVC compilers out-of-the-box. Not all of those external libraries are +portable to Windows either. 
+ Installing CMake ^^^^^^^^^^^^^^^^ diff --git a/doc/src/Build_development.rst b/doc/src/Build_development.rst index 3c2acbaa7e..5492a1e536 100644 --- a/doc/src/Build_development.rst +++ b/doc/src/Build_development.rst @@ -185,6 +185,10 @@ The ``ctest`` command has many options, the most important ones are: - run subset of tests matching the regular expression * - -E - exclude subset of tests matching the regular expression + * - -L + - run subset of tests with a label matching the regular expression + * - -LE + - exclude subset of tests with a label matching the regular expression * - -N - dry-run: display list of tests without running them * - -T memcheck @@ -299,6 +303,12 @@ will destroy the original file, if the generation run does not complete, so using *-g* is recommended unless the YAML file is fully tested and working. +Some of the force style tests are rather slow to run and some are very +sensitive to small differences like CPU architecture, compiler +toolchain, compiler optimization. Those tests are flagged with a "slow" +and/or "unstable" label, and thus those tests can be selectively +excluded with the ``-LE`` flag or selected with the ``-L`` flag. + .. admonition:: Recommendations and notes for YAML files :class: note diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 2157fe86c8..9648df402f 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -341,6 +341,18 @@ minutes to hours) to build. Of course you only need to do that once.) $ make lib-kim args="-p /usr/local" # use an existing KIM API installation at the provided location $ make lib-kim args="-p /usr/local -a EAM_Dynamo_Ackland_W__MO_141627196590_002" # ditto but add one model or driver + When using the "-b " option, the KIM library is built using its native + cmake build system. The ``lib/kim/Install.py`` script supports a + ``CMAKE`` environment variable if the cmake executable is named other + than ``cmake`` on your system. 
Additional environment variables may be + provided on the command line for use by cmake. For example, to use the + ``cmake3`` executable and tell it to use the GNU version 11 compilers + to build KIM, one could use the following command line. + + .. code-block:: bash + + $ CMAKE=cmake3 CXX=g++-11 CC=gcc-11 FC=gfortran-11 make lib-kim args="-b " # (re-)install KIM API lib using cmake3 and gnu v11 compilers with only example models + Settings for debugging OpenKIM web queries discussed below need to be applied by adding them to the ``LMP_INC`` variable through editing the ``Makefile.machine`` you are using. For example: @@ -560,11 +572,26 @@ They must be specified in uppercase. * - VEGA908 - GPU - AMD GPU MI100 GFX908 - * - INTEL_GEN + * - VEGA90A - GPU - - Intel GPUs Gen9+ + - AMD GPU + * - INTEL_DG1 + - GPU + - Intel Iris XeMAX GPU + * - INTEL_GEN9 + - GPU + - Intel GPU Gen9 + * - INTEL_GEN11 + - GPU + - Intel GPU Gen11 + * - INTEL_GEN12LP + - GPU + - Intel GPU Gen12LP + * - INTEL_XEHP + - GPU + - Intel GPUs Xe-HP -This list was last updated for version 3.4.1 of the Kokkos library. +This list was last updated for version 3.5.0 of the Kokkos library. .. tabs:: diff --git a/doc/src/Build_windows.rst b/doc/src/Build_windows.rst index fa2296d302..4bb5cfec27 100644 --- a/doc/src/Build_windows.rst +++ b/doc/src/Build_windows.rst @@ -89,6 +89,11 @@ miss the correct master ``CMakeLists.txt``. Try to open the starting point. It is also possible to configure and compile LAMMPS from the command line with a CMake binary from `cmake.org `_. +Please note that for either approach CMake will create a so-called +:ref:`"multi-configuration" build environment <cmake_multiconfig>`, and +the command lines for building and testing LAMMPS must be adjusted +accordingly. + To support running in parallel you can compile with OpenMP enabled using the OPENMP package or install Microsoft MPI (including the SDK) and compile LAMMPS with MPI enabled. 
diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst index c28c9db864..40b99a5fb8 100644 --- a/doc/src/Commands_bond.rst +++ b/doc/src/Commands_bond.rst @@ -37,6 +37,7 @@ OPT. * :doc:`class2 (ko) ` * :doc:`fene (iko) ` * :doc:`fene/expand (o) ` + * :doc:`fene/nm ` * :doc:`gaussian ` * :doc:`gromos (o) ` * :doc:`harmonic (iko) ` diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst index 0c60883314..f1f0597f30 100644 --- a/doc/src/Commands_compute.rst +++ b/doc/src/Commands_compute.rst @@ -28,6 +28,7 @@ KOKKOS, o = OPENMP, t = OPT. * :doc:`angle ` * :doc:`angle/local ` * :doc:`angmom/chunk ` + * :doc:`ave/sphere/atom (k) ` * :doc:`basal/atom ` * :doc:`body/local ` * :doc:`bond ` diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 7cf4e7635b..9ac4fc851c 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -210,6 +210,7 @@ OPT. * :doc:`nm/cut (o) ` * :doc:`nm/cut/coul/cut (o) ` * :doc:`nm/cut/coul/long (o) ` + * :doc:`nm/cut/split ` * :doc:`oxdna/coaxstk ` * :doc:`oxdna/excv ` * :doc:`oxdna/hbond ` @@ -262,6 +263,7 @@ OPT. * :doc:`spin/neel ` * :doc:`srp ` * :doc:`sw (giko) ` + * :doc:`sw/mod (o) ` * :doc:`table (gko) ` * :doc:`table/rx (k) ` * :doc:`tdpd ` diff --git a/doc/src/Developer_platform.rst b/doc/src/Developer_platform.rst index c9ecd30cec..cdc4bb6770 100644 --- a/doc/src/Developer_platform.rst +++ b/doc/src/Developer_platform.rst @@ -118,6 +118,9 @@ Environment variable functions .. doxygenfunction:: putenv :project: progguide +.. doxygenfunction:: unsetenv + :project: progguide + .. doxygenfunction:: list_pathenv :project: progguide diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index db47a9e3c3..a9969b7543 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -56,11 +56,11 @@ String to number conversions with validity check These functions should be used to convert strings to numbers. 
They are are strongly preferred over C library calls like ``atoi()`` or -``atof()`` since they check if the **entire** provided string is a valid +``atof()`` since they check if the **entire** string is a valid (floating-point or integer) number, and will error out instead of silently returning the result of a partial conversion or zero in cases -where the string is not a valid number. This behavior allows to more -easily detect typos or issues when processing input files. +where the string is not a valid number. This behavior improves +detecting typos or issues when processing input files. Similarly the :cpp:func:`logical() ` function will convert a string into a boolean and will only accept certain words. @@ -76,19 +76,34 @@ strings for compliance without conversion. ---------- -.. doxygenfunction:: numeric +.. doxygenfunction:: numeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp) :project: progguide -.. doxygenfunction:: inumeric +.. doxygenfunction:: numeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp) :project: progguide -.. doxygenfunction:: bnumeric +.. doxygenfunction:: inumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp) :project: progguide -.. doxygenfunction:: tnumeric +.. doxygenfunction:: inumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp) :project: progguide -.. doxygenfunction:: logical +.. doxygenfunction:: bnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp) + :project: progguide + +.. doxygenfunction:: bnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp) + :project: progguide + +.. doxygenfunction:: tnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp) + :project: progguide + +.. doxygenfunction:: tnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp) + :project: progguide + +.. 
doxygenfunction:: logical(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp) + :project: progguide + +.. doxygenfunction:: logical(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp) :project: progguide diff --git a/doc/src/Developer_write.rst b/doc/src/Developer_write.rst index c3df6ad6bb..bdc6559060 100644 --- a/doc/src/Developer_write.rst +++ b/doc/src/Developer_write.rst @@ -55,7 +55,7 @@ of each timestep. First of all, implement a constructor: if (narg < 4) error->all(FLERR,"Illegal fix print/vel command"); - nevery = force->inumeric(FLERR,arg[3]); + nevery = utils::inumeric(FLERR,arg[3],false,lmp); if (nevery <= 0) error->all(FLERR,"Illegal fix print/vel command"); } diff --git a/doc/src/Errors_messages.rst b/doc/src/Errors_messages.rst index 3a593b5a3f..c06f4c86e3 100644 --- a/doc/src/Errors_messages.rst +++ b/doc/src/Errors_messages.rst @@ -7772,9 +7772,6 @@ keyword to allow for additional bonds to be formed The system size must fit in a 32-bit integer to use this dump style. -*Too many atoms to dump sort* - Cannot sort when running with more than 2\^31 atoms. - *Too many elements extracted from MEAM library.* Increase 'maxelt' in meam.h and recompile. diff --git a/doc/src/Howto_drude2.rst b/doc/src/Howto_drude2.rst index 589e9d7b9a..00289a989a 100644 --- a/doc/src/Howto_drude2.rst +++ b/doc/src/Howto_drude2.rst @@ -491,11 +491,6 @@ NPT ensemble using Nose-Hoover thermostat: **(Schroeder)** Schroeder and Steinhauser, J Chem Phys, 133, 154511 (2010). -.. _Jiang2: - -**(Jiang)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux, - J Phys Chem Lett, 2, 87-92 (2011). - .. _Thole2: **(Thole)** Chem Phys, 59, 341 (1981). diff --git a/doc/src/Howto_github.rst b/doc/src/Howto_github.rst index 278b9e4bfd..315bacac69 100644 --- a/doc/src/Howto_github.rst +++ b/doc/src/Howto_github.rst @@ -141,7 +141,8 @@ unrelated feature, you should switch branches! 
Committing changes to the *develop*, *release*, or *stable* branches is strongly discouraged. While it may be convenient initially, it will create more work in the long run. Various texts and tutorials - on using git effectively discuss the motivation for this. + on using git effectively discuss the motivation for using feature + branches instead. **After changes are made** diff --git a/doc/src/Install_git.rst b/doc/src/Install_git.rst index 4e7db77873..a5dc19fe79 100644 --- a/doc/src/Install_git.rst +++ b/doc/src/Install_git.rst @@ -28,8 +28,9 @@ provides `limited support for subversion clients `_. You can follow the LAMMPS development on 3 different git branches: -* **stable** : this branch is updated with every stable release; - updates are always "fast forward" merges from *develop* +* **stable** : this branch is updated from the *release* branch with + every stable release version and also has selected bug fixes and updates + back-ported from the *develop* branch * **release** : this branch is updated with every patch release; updates are always "fast forward" merges from *develop* * **develop** : this branch follows the ongoing development and @@ -47,20 +48,22 @@ your machine and "release" is one of the 3 branches listed above. (Note that you actually download all 3 branches; you can switch between them at any time using "git checkout ".) -.. note:: +.. admonition:: Saving time and disk space when using ``git clone`` The complete git history of the LAMMPS project is quite large because it contains the entire commit history of the project since fall 2006, - which includes the time when LAMMPS was managed with subversion. This - also includes commits that have added and removed some large files - (mostly by accident). 
If you do not need access to the entire commit - history, you can speed up the "cloning" process and reduce local disk - space requirements by using the *--depth* git command line flag thus - create a "shallow clone" of the repository that contains only a - subset of the git history. Using a depth of 1000 is usually sufficient - to include the head commits of the *develop* and the *release* branches. - To include the head commit of the *stable* branch you may need a depth - of up to 10000. + which includes the time when LAMMPS was managed with subversion. + This includes a few commits that have added and removed some large + files (mostly by accident). If you do not need access to the entire + commit history (most people don't), you can speed up the "cloning" + process and reduce local disk space requirements by using the + *--depth* git command line flag. That will create a "shallow clone" + of the repository containing only a subset of the git history. Using + a depth of 1000 is usually sufficient to include the head commits of + the *develop* and the *release* branches. To include the head commit + of the *stable* branch you may need a depth of up to 10000. If you + later need more of the git history, you can always convert the + shallow clone into a "full clone". Once the command completes, your directory will contain the same files as if you unpacked a current LAMMPS tarball, with the exception, that @@ -156,9 +159,9 @@ changed. How to do this depends on the build system you are using. .. admonition:: Git protocols :class: note - The servers at github.com support the "git://" and "https://" access - protocols for anonymous, read-only access. If you have a suitably - configured GitHub account, you may also use SSH protocol with the + The servers at github.com support the "https://" access protocol for + anonymous, read-only access. If you have a suitably configured GitHub + account, you may also use SSH protocol with the URL "git@github.com:lammps/lammps.git". 
The LAMMPS GitHub project is currently managed by Axel Kohlmeyer diff --git a/doc/src/Intro_citing.rst b/doc/src/Intro_citing.rst index 0e10b7559a..08f82fac33 100644 --- a/doc/src/Intro_citing.rst +++ b/doc/src/Intro_citing.rst @@ -16,7 +16,7 @@ source code design, the program structure, the spatial decomposition approach, the neighbor finding, basic communications algorithms, and how users and developers have contributed to LAMMPS is: - `LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 `_ + `LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. 271, 108171 (2022) `_ So a project using LAMMPS or a derivative application that uses LAMMPS as a simulation engine should cite this paper. The paper is expected to diff --git a/doc/src/Manual_version.rst b/doc/src/Manual_version.rst index b705ce8c4a..78ed61cd7c 100644 --- a/doc/src/Manual_version.rst +++ b/doc/src/Manual_version.rst @@ -10,23 +10,31 @@ Whenever we fix a bug or update or add a feature, it will be merged into the *develop* branch of the git repository. When a sufficient number of changes have accumulated *and* the software passes a set of automated tests, we release it in the next *patch* release, which are made every -few weeks. Info on patch releases are on `this website page +few weeks. The *release* branch of the git repository is updated with +every such release. Info on patch releases are on `this website page `_. -Once or twice a year, only bug fixes and small, non-intrusive changes are -included for a period of time, and the code is subjected to more detailed +Once or twice a year, we apply only bug fixes and small, non-intrusive +changes to the *develop* branch and the code is subjected to more detailed and thorough testing than the default automated testing. 
The latest -patch release after such a period is then labeled as a *stable* version. +patch release after such a period is then also labeled as a *stable* version +and the *stable* branch is updated with it. Between stable releases +we occasionally release some updates to the stable release containing +only bug fixes and updates back-ported from *develop* but no new features +and update the *stable* branch accordingly. -Each version of LAMMPS contains all the features and bug-fixes up to -and including its version date. +Each version of LAMMPS contains all the documented features up to and +including its version date. The version date is printed to the screen and logfile every time you run LAMMPS. It is also in the file src/version.h and in the LAMMPS directory name created when you unpack a tarball. And it is on the first page of the :doc:`manual `. -* If you browse the HTML pages on the LAMMPS WWW site, they always - describe the most current patch release of LAMMPS. +* If you browse the HTML pages on the LAMMPS WWW site, they will by + default describe the most current patch release version of LAMMPS. + In the navigation bar on the bottom left, there is the option to + view instead the documentation for the most recent *stable* version + or the latest version from the current development branch. * If you browse the HTML pages included in your tarball, they describe the version you have, which may be older. diff --git a/doc/src/Modify_pair.rst b/doc/src/Modify_pair.rst index 7263b8fd48..6913204504 100644 --- a/doc/src/Modify_pair.rst +++ b/doc/src/Modify_pair.rst @@ -12,24 +12,24 @@ includes some optional methods to enable its use with rRESPA. 
Here is a brief description of the class methods in pair.h: -+---------------------------------+-------------------------------------------------------------------+ -| compute | workhorse routine that computes pairwise interactions | -+---------------------------------+-------------------------------------------------------------------+ -| settings | reads the input script line with arguments you define | -+---------------------------------+-------------------------------------------------------------------+ -| coeff | set coefficients for one i,j type pair | -+---------------------------------+-------------------------------------------------------------------+ -| init_one | perform initialization for one i,j type pair | -+---------------------------------+-------------------------------------------------------------------+ -| init_style | initialization specific to this pair style | -+---------------------------------+-------------------------------------------------------------------+ -| write & read_restart | write/read i,j pair coeffs to restart files | -+---------------------------------+-------------------------------------------------------------------+ -| write & read_restart_settings | write/read global settings to restart files | -+---------------------------------+-------------------------------------------------------------------+ -| single | force and energy of a single pairwise interaction between 2 atoms | -+---------------------------------+-------------------------------------------------------------------+ -| compute_inner/middle/outer | versions of compute used by rRESPA | -+---------------------------------+-------------------------------------------------------------------+ ++---------------------------------+---------------------------------------------------------------------+ +| compute | workhorse routine that computes pairwise interactions | 
++---------------------------------+---------------------------------------------------------------------+ +| settings | reads the input script line with arguments you define | ++---------------------------------+---------------------------------------------------------------------+ +| coeff | set coefficients for one i,j type pair | ++---------------------------------+---------------------------------------------------------------------+ +| init_one | perform initialization for one i,j type pair | ++---------------------------------+---------------------------------------------------------------------+ +| init_style | initialization specific to this pair style | ++---------------------------------+---------------------------------------------------------------------+ +| write & read_restart | write/read i,j pair coeffs to restart files | ++---------------------------------+---------------------------------------------------------------------+ +| write & read_restart_settings | write/read global settings to restart files | ++---------------------------------+---------------------------------------------------------------------+ +| single | force/r and energy of a single pairwise interaction between 2 atoms | ++---------------------------------+---------------------------------------------------------------------+ +| compute_inner/middle/outer | versions of compute used by rRESPA | ++---------------------------------+---------------------------------------------------------------------+ The inner/middle/outer routines are optional. diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst index e5e548a341..eb5cae0443 100644 --- a/doc/src/Packages_details.rst +++ b/doc/src/Packages_details.rst @@ -1907,6 +1907,12 @@ MPIIO library. It adds :doc:`dump styles ` with a "mpiio" in their style name. Restart files with an ".mpiio" suffix are also written and read in parallel. +.. 
warning:: + + The MPIIO package is currently unmaintained and has become + unreliable. Use with caution. + + **Install:** The MPIIO package requires that LAMMPS is built in :ref:`MPI parallel mode `. diff --git a/doc/src/angle_class2.rst b/doc/src/angle_class2.rst index f257d96dc3..4e8e515564 100644 --- a/doc/src/angle_class2.rst +++ b/doc/src/angle_class2.rst @@ -64,34 +64,44 @@ These are the 4 coefficients for the :math:`E_a` formula: radians internally; hence the various :math:`K` are effectively energy per radian\^2 or radian\^3 or radian\^4. -For the :math:`E_{bb}` formula, each line in a :doc:`angle_coeff ` -command in the input script lists 4 coefficients, the first of which -is "bb" to indicate they are BondBond coefficients. In a data file, -these coefficients should be listed under a "BondBond Coeffs" heading -and you must leave out the "bb", i.e. only list 3 coefficients after -the angle type. +For the :math:`E_{bb}` formula, each line in a :doc:`angle_coeff +` command in the input script lists 4 coefficients, the +first of which is "bb" to indicate they are BondBond coefficients. In +a data file, these coefficients should be listed under a "BondBond +Coeffs" heading and you must leave out the "bb", i.e. only list 3 +coefficients after the angle type. * bb * :math:`M` (energy/distance\^2) * :math:`r_1` (distance) * :math:`r_2` (distance) -For the :math:`E_{ba}` formula, each line in a :doc:`angle_coeff ` -command in the input script lists 5 coefficients, the first of which -is "ba" to indicate they are BondAngle coefficients. In a data file, -these coefficients should be listed under a "BondAngle Coeffs" heading -and you must leave out the "ba", i.e. only list 4 coefficients after -the angle type. +For the :math:`E_{ba}` formula, each line in a :doc:`angle_coeff +` command in the input script lists 5 coefficients, the +first of which is "ba" to indicate they are BondAngle coefficients. 
+In a data file, these coefficients should be listed under a "BondAngle +Coeffs" heading and you must leave out the "ba", i.e. only list 4 +coefficients after the angle type. * ba -* :math:`N_1` (energy/distance\^2) -* :math:`N_2` (energy/distance\^2) +* :math:`N_1` (energy/distance) +* :math:`N_2` (energy/distance) * :math:`r_1` (distance) * :math:`r_2` (distance) The :math:`\theta_0` value in the :math:`E_{ba}` formula is not specified, since it is the same value from the :math:`E_a` formula. +.. note:: + + It is important that the order of the I,J,K atoms in each angle + listed in the Angles section of the data file read by the + :doc:`read_data ` command be consistent with the order + of the :math:`r_1` and :math:`r_2` BondBond and BondAngle + coefficients. This is because the terms in the formulas for + :math:`E_{bb}` and :math:`E_{ba}` will use the I,J atoms to compute + :math:`r_{ij}` and the J,K atoms to compute :math:`r_{jk}`. + ---------- .. include:: accel_styles.rst diff --git a/doc/src/bond_fene.rst b/doc/src/bond_fene.rst index 108f538628..be7775489a 100644 --- a/doc/src/bond_fene.rst +++ b/doc/src/bond_fene.rst @@ -1,4 +1,5 @@ .. index:: bond_style fene +.. index:: bond_style fene/nm .. index:: bond_style fene/intel .. index:: bond_style fene/kk .. index:: bond_style fene/omp @@ -8,12 +9,16 @@ bond_style fene command Accelerator Variants: *fene/intel*, *fene/kk*, *fene/omp* +bond_style fene/nm command +========================== + Syntax """""" .. code-block:: LAMMPS bond_style fene + bond_style fene/nm Examples """""""" @@ -23,6 +28,9 @@ Examples bond_style fene bond_coeff 1 30.0 1.5 1.0 1.0 + bond_style fene/nm + bond_coeff 1 2.25344 1.5 1.0 1.12246 2 6 + Description """"""""""" @@ -38,16 +46,36 @@ term is attractive, the second Lennard-Jones term is repulsive. The first term extends to :math:`R_0`, the maximum extent of the bond. The second term is cutoff at :math:`2^\frac{1}{6} \sigma`, the minimum of the LJ potential. 
-The following coefficients must be defined for each bond type via the -:doc:`bond_coeff ` command as in the example above, or in -the data file or restart files read by the :doc:`read_data ` -or :doc:`read_restart ` commands: +The *fene/nm* bond style substitutes the standard LJ potential with the generalized LJ potential +in the same form as in pair style :doc:`nm/cut `. The bond energy is then given by + +.. math:: + + E = -0.5 K r_0^2 \ln \left[ 1 - \left(\frac{r}{R_0}\right)^2\right] + \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n \left(\frac{r_0}{r}\right)^m \right] + +Similar to the *fene* style, the generalized Lennard-Jones is cut off at +the potential minimum, :math:`r_0`, to be repulsive only. The following +coefficients must be defined for each bond type via the :doc:`bond_coeff +` command as in the example above, or in the data file or +restart files read by the :doc:`read_data ` or +:doc:`read_restart ` commands: * :math:`K` (energy/distance\^2) * :math:`R_0` (distance) * :math:`\epsilon` (energy) * :math:`\sigma` (distance) +For the *fene/nm* style, the following coefficients are used. Please +note, that the standard LJ potential and thus the regular FENE potential +is recovered for (n=12 m=6) and :math:`r_0 = 2^\frac{1}{6} \sigma`. + +* :math:`K` (energy/distance\^2) +* :math:`R_0` (distance) +* :math:`E_0` (energy) +* :math:`r_0` (distance) +* :math:`n` (unitless) +* :math:`m` (unitless) + ---------- .. include:: accel_styles.rst @@ -57,9 +85,10 @@ or :doc:`read_restart ` commands: Restrictions """""""""""" -This bond style can only be used if LAMMPS was built with the MOLECULE -package. See the :doc:`Build package ` page for more -info. +The *fene* bond style can only be used if LAMMPS was built with the MOLECULE +package; the *fene/nm* bond style can only be used if LAMMPS was built +with the EXTRA-MOLECULE package. See the :doc:`Build package ` +page for more info. 
You typically should specify :doc:`special_bonds fene ` or :doc:`special_bonds lj/coul 0 1 1 ` to use this bond @@ -68,7 +97,8 @@ style. LAMMPS will issue a warning it that's not the case. Related commands """""""""""""""" -:doc:`bond_coeff `, :doc:`delete_bonds ` +:doc:`bond_coeff `, :doc:`delete_bonds `, +:doc:`pair style lj/cut `, :doc:`pair style nm/cut `. Default """"""" diff --git a/doc/src/bond_style.rst b/doc/src/bond_style.rst index 177dc8cc05..4dee48a78d 100644 --- a/doc/src/bond_style.rst +++ b/doc/src/bond_style.rst @@ -87,6 +87,7 @@ accelerated styles exist. * :doc:`class2 ` - COMPASS (class 2) bond * :doc:`fene ` - FENE (finite-extensible non-linear elastic) bond * :doc:`fene/expand ` - FENE bonds with variable size particles +* :doc:`fene/nm ` - FENE bonds with a generalized Lennard-Jones potential * :doc:`gaussian ` - multicentered Gaussian-based bond potential * :doc:`gromos ` - GROMOS force field bond * :doc:`harmonic ` - harmonic bond diff --git a/doc/src/compute.rst b/doc/src/compute.rst index 0b8249cc7d..9a501127c2 100644 --- a/doc/src/compute.rst +++ b/doc/src/compute.rst @@ -174,6 +174,7 @@ The individual style names on the :doc:`Commands compute ` pag * :doc:`angle ` - energy of each angle sub-style * :doc:`angle/local ` - theta and energy of each angle * :doc:`angmom/chunk ` - angular momentum for each chunk +* :doc:`ave/sphere/atom ` - compute local density and temperature around each atom * :doc:`basal/atom ` - calculates the hexagonal close-packed "c" lattice vector of each atom * :doc:`body/local ` - attributes of body sub-particles * :doc:`bond ` - energy of each bond sub-style diff --git a/doc/src/compute_ave_sphere_atom.rst b/doc/src/compute_ave_sphere_atom.rst new file mode 100644 index 0000000000..db04682865 --- /dev/null +++ b/doc/src/compute_ave_sphere_atom.rst @@ -0,0 +1,101 @@ +.. index:: compute ave/sphere/atom +.. 
index:: compute ave/sphere/atom/kk + +compute ave/sphere/atom command +================================ + +Accelerator Variants: *ave/sphere/atom/kk* + +Syntax +"""""" + +.. parsed-literal:: + + compute ID group-ID ave/sphere/atom keyword values ... + +* ID, group-ID are documented in :doc:`compute ` command +* ave/sphere/atom = style name of this compute command +* one or more keyword/value pairs may be appended + + .. parsed-literal:: + + keyword = *cutoff* + *cutoff* value = distance cutoff + +Examples +"""""""" + +.. code-block:: LAMMPS + + compute 1 all ave/sphere/atom + + compute 1 all ave/sphere/atom cutoff 5.0 + comm_modify cutoff 5.0 + +Description +""""""""""" + +Define a computation that calculates the local density and temperature +for each atom and neighbors inside a spherical cutoff. + +The optional keyword *cutoff* defines the distance cutoff +used when searching for neighbors. The default value is the cutoff +specified by the pair style. If no pair style is defined, then a cutoff +must be defined using this keyword. If the specified cutoff is larger than +that of the pair_style plus neighbor skin (or no pair style is defined), +the *comm_modify cutoff* option must also be set to match that of the +*cutoff* keyword. + +The neighbor list needed to compute this quantity is constructed each +time the calculation is performed (i.e. each time a snapshot of atoms +is dumped). Thus it can be inefficient to compute/dump this quantity +too frequently. + +.. note:: + + If you have a bonded system, then the settings of + :doc:`special_bonds ` command can remove pairwise + interactions between atoms in the same bond, angle, or dihedral. This + is the default setting for the :doc:`special_bonds ` + command, and means those pairwise interactions do not appear in the + neighbor list. Because this compute uses the neighbor list, it also means + those pairs will not be included in the local density and temperature. 
This + difficulty can be circumvented by writing a dump file, and using the + :doc:`rerun ` command to compute the density and temperature for + snapshots in the dump file. The rerun script can use a + :doc:`special_bonds ` command that includes all pairs in + the neighbor list. + +---------- + + +.. include:: accel_styles.rst + + +---------- + +Output info +""""""""""" + +This compute calculates a per-atom array with two columns: density and temperature. + +These values can be accessed by any command that uses per-atom values +from a compute as input. See the :doc:`Howto output ` doc +page for an overview of LAMMPS output options. + +Restrictions +"""""""""""" + +This compute is part of the EXTRA-COMPUTE package. It is only enabled if +LAMMPS was built with that package. See the :doc:`Build package ` page for more info. + +Related commands +"""""""""""""""" + +:doc:`comm_modify ` + +Default +""""""" + +The option defaults are *cutoff* = pair style cutoff + diff --git a/doc/src/compute_bond_local.rst b/doc/src/compute_bond_local.rst index 8bdde70dd9..24b0943484 100644 --- a/doc/src/compute_bond_local.rst +++ b/doc/src/compute_bond_local.rst @@ -13,7 +13,7 @@ Syntax * ID, group-ID are documented in :doc:`compute ` command * bond/local = style name of this compute command * one or more values may be appended -* value = *dist* or *engpot* or *force* or *fx* or *fy* or *fz* or *engvib* or *engrot* or *engtrans* or *omega* or *velvib* or *v_name* +* value = *dist* or *dx* or *dy* or *dz* or *engpot* or *force* or *fx* or *fy* or *fz* or *engvib* or *engrot* or *engtrans* or *omega* or *velvib* or *v_name* .. 
parsed-literal:: @@ -21,6 +21,7 @@ Syntax *engpot* = bond potential energy *force* = bond force + *dx*,\ *dy*,\ *dz* = components of pairwise distance *fx*,\ *fy*,\ *fz* = components of bond force *engvib* = bond kinetic energy of vibration *engrot* = bond kinetic energy of rotation @@ -63,6 +64,9 @@ whether the 2 atoms represent a simple diatomic molecule, or are part of some larger molecule. The value *dist* is the current length of the bond. +The values *dx*, *dy*, and *dz* are the xyz components of the +*distance* between the pair of atoms. This value is always the +distance from the atom of lower to the one with the higher id. The value *engpot* is the potential energy for the bond, based on the current separation of the pair of atoms in the bond. diff --git a/doc/src/compute_heat_flux.rst b/doc/src/compute_heat_flux.rst index 94d6f09700..56975adc70 100644 --- a/doc/src/compute_heat_flux.rst +++ b/doc/src/compute_heat_flux.rst @@ -89,13 +89,20 @@ included in the calculation. .. warning:: The compute *heat/flux* has been reported to produce unphysical - values for angle, dihedral and improper contributions + values for angle, dihedral, improper and constraint force contributions when used with :doc:`compute stress/atom `, - as discussed in :ref:`(Surblys) ` and :ref:`(Boone) `. - You are strongly advised to + as discussed in :ref:`(Surblys2019) `, :ref:`(Boone) ` + and :ref:`(Surblys2021) `. You are strongly advised to use :doc:`compute centroid/stress/atom `, which has been implemented specifically for such cases. +.. warning:: + + Due to an implementation detail, the :math:`y` and :math:`z` + components of heat flux from :doc:`fix rigid ` + contribution when computed via :doc:`compute stress/atom ` + are highly unphysical and should not be used. + The Green-Kubo formulas relate the ensemble average of the auto-correlation of the heat flux :math:`\mathbf{J}` to the thermal conductivity :math:`\kappa`: @@ -232,10 +239,14 @@ none ---------- -.. _Surblys2: +.. 
_Surblys3: -**(Surblys)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019). +**(Surblys2019)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019). .. _Boone: **(Boone)** Boone, Babaei, Wilmer, J Chem Theory Comput, 15, 5579--5587 (2019). + +.. _Surblys4: + +**(Surblys2021)** Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021). diff --git a/doc/src/compute_pair_local.rst b/doc/src/compute_pair_local.rst index f464c7cec6..38953d203c 100644 --- a/doc/src/compute_pair_local.rst +++ b/doc/src/compute_pair_local.rst @@ -13,11 +13,12 @@ Syntax * ID, group-ID are documented in :doc:`compute ` command * pair/local = style name of this compute command * one or more values may be appended -* value = *dist* or *eng* or *force* or *fx* or *fy* or *fz* or *pN* +* value = *dist* or *dx* or *dy* or *dz* or *eng* or *force* or *fx* or *fy* or *fz* or *pN* .. parsed-literal:: *dist* = pairwise distance + *dx*,\ *dy*,\ *dz* = components of pairwise distance *eng* = pairwise energy *force* = pairwise force *fx*,\ *fy*,\ *fz* = components of pairwise force @@ -56,6 +57,9 @@ force cutoff distance for that interaction, as defined by the commands. The value *dist* is the distance between the pair of atoms. +The values *dx*, *dy*, and *dz* are the xyz components of the +*distance* between the pair of atoms. This value is always the +distance from the atom of lower to the one with the higher id. The value *eng* is the interaction energy for the pair of atoms. @@ -89,10 +93,10 @@ from the second of the two sub-styles. If the referenced *pN* is not computed for the specific pairwise interaction (based on atom types), then the output will be 0.0. -The value *dist* will be in distance :doc:`units `. The value -*eng* will be in energy :doc:`units `. The values *force*, *fx*, -*fy*, and *fz* will be in force :doc:`units `. The values *pN* -will be in whatever units the pair style defines. 
+The values *dist*, *dx*, *dy* and *dz* will be in distance :doc:`units `. +The value *eng* will be in energy :doc:`units `. +The values *force*, *fx*, *fy*, and *fz* will be in force :doc:`units `. +The values *pN* will be in whatever units the pair style defines. The optional *cutoff* keyword determines how the force cutoff distance for an interaction is determined. For the default setting of *type*, diff --git a/doc/src/compute_stress_atom.rst b/doc/src/compute_stress_atom.rst index cdb464a9d0..2c8be0c05a 100644 --- a/doc/src/compute_stress_atom.rst +++ b/doc/src/compute_stress_atom.rst @@ -87,6 +87,10 @@ Tersoff 3-body interaction) is assigned in equal portions to each atom in the set. E.g. 1/4 of the dihedral virial to each of the 4 atoms, or 1/3 of the fix virial due to SHAKE constraints applied to atoms in a water molecule via the :doc:`fix shake ` command. +As an exception, the virial contribution from +constraint forces in :doc:`fix rigid ` on each atom +is computed from the constraint force acting on the corresponding atom +and its position, i.e. the total virial is not equally distributed. In case of compute *centroid/stress/atom*, the virial contribution is: @@ -103,13 +107,25 @@ atom :math:`I` due to the interaction and the relative position :math:`\mathbf{r}_{I0}` of the atom :math:`I` to the geometric center of the interacting atoms, i.e. centroid, is used. As the geometric center is different for each interaction, the :math:`\mathbf{r}_{I0}` -also differs. The sixth and seventh terms, Kspace and :doc:`fix -` contribution respectively, are computed identical to compute -*stress/atom*. Although the total system virial is the same as +also differs. The sixth term, Kspace contribution, +is computed identically to compute *stress/atom*. +The seventh term is handled differently depending on +whether the constraint forces are due to :doc:`fix shake ` +or :doc:`fix rigid `. 
+In case of SHAKE constraints, each distance constraint is +handled as a pairwise interaction. +E.g. in case of a water molecule, two OH and one HH distance +constraints are treated as three pairwise interactions. +In case of :doc:`fix rigid `, +all constraint forces in the molecule are treated +as a single many-body interaction with a single centroid position. +In case of a water molecule, the formula expression would become +identical to that of the three-body angle interaction. +Although the total system virial is the same as compute *stress/atom*, compute *centroid/stress/atom* is known to -result in more consistent heat flux values for angle, dihedrals and -improper contributions when computed via :doc:`compute heat/flux -`. +result in more consistent heat flux values for angle, dihedrals, +improper and constraint force contributions +when computed via :doc:`compute heat/flux `. If no extra keywords are listed, the kinetic contribution and all of the virial contribution terms are included in the per-atom stress tensor. @@ -134,7 +150,8 @@ contribution for the cluster interaction is divided evenly among those atoms. Details of how compute *centroid/stress/atom* obtains the virial for -individual atoms is given in :ref:`(Surblys) `, where the +individual atoms are given in :ref:`(Surblys2019) ` and +:ref:`(Surblys2021) `, where the idea is that the virial of the atom :math:`I` is the result of only the force :math:`\mathbf{F}_I` on the atom due to the interaction and its positional vector :math:`\mathbf{r}_{I0}`, relative to the @@ -235,10 +252,10 @@ between the pair of particles. All bond styles are supported. All angle, dihedral, improper styles are supported with the exception of INTEL and KOKKOS variants of specific styles. It also does not support models with long-range Coulombic or dispersion forces, -i.e. the kspace_style command in LAMMPS. 
It also does not support the -following fixes which add rigid-body constraints: :doc:`fix shake -`, :doc:`fix rattle `, :doc:`fix rigid -`, :doc:`fix rigid/small `. +i.e. the kspace_style command in LAMMPS. It also does not implement the +following fixes which add rigid-body constraints: +:doc:`fix rigid/* ` and the OpenMP accelerated version of :doc:`fix rigid/small `, +while all other :doc:`fix rigid/*/small ` are implemented. LAMMPS will generate an error if one of these options is included in your model. Extension of centroid stress calculations to these force @@ -270,4 +287,8 @@ none .. _Surblys1: -**(Surblys)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019). +**(Surblys2019)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019). + +.. _Surblys2: + +**(Surblys2021)** Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021). diff --git a/doc/src/delete_atoms.rst b/doc/src/delete_atoms.rst index d47743071b..f78f295011 100644 --- a/doc/src/delete_atoms.rst +++ b/doc/src/delete_atoms.rst @@ -20,8 +20,10 @@ Syntax cutoff = delete one atom from pairs of atoms within the cutoff (distance units) group1-ID = one atom in pair must be in this group group2-ID = other atom in pair must be in this group - *porosity* args = region-ID fraction seed + *porosity* args = group-ID region-ID fraction seed + group-ID = group within which to perform deletions region-ID = region within which to perform deletions + or NULL to only impose the group criterion fraction = delete this fraction of atoms seed = random number seed (positive integer) @@ -43,7 +45,8 @@ Examples delete_atoms region sphere compress no delete_atoms overlap 0.3 all all delete_atoms overlap 0.5 solvent colloid - delete_atoms porosity cube 0.1 482793 bond yes + delete_atoms porosity all cube 0.1 482793 bond yes + delete_atoms porosity polymer cube 0.1 482793 bond yes Description """"""""""" @@ -76,12 +79,17 @@ have occurred that no atom pairs within the cutoff will 
remain minimum number of atoms will be deleted, or that the same atoms will be deleted when running on different numbers of processors. -For style *porosity* a specified *fraction* of atoms are deleted -within the specified region. For example, if fraction is 0.1, then -10% of the atoms will be deleted. The atoms to delete are chosen -randomly. There is no guarantee that the exact fraction of atoms will -be deleted, or that the same atoms will be deleted when running on -different numbers of processors. +For style *porosity* a specified *fraction* of atoms are deleted which +are both in the specified group and within the specified region. The +region-ID can be specified as NULL to only impose the group criterion. +Likewise, specifying the group-ID as *all* will only impose the region +criterion. + +For example, if fraction is 0.1, then 10% of the eligible atoms will +be deleted. The atoms to delete are chosen randomly. There is no +guarantee that the exact fraction of atoms will be deleted, or that +the same atoms will be deleted when running on different numbers of +processors. If the *compress* keyword is set to *yes*, then after atoms are deleted, then atom IDs are re-assigned so that they run from 1 to the @@ -89,8 +97,8 @@ number of atoms in the system. Note that this is not done for molecular systems (see the :doc:`atom_style ` command), regardless of the *compress* setting, since it would foul up the bond connectivity that has already been assigned. However, the -:doc:`reset_atom_ids ` command can be used after this command to -accomplish the same thing. +:doc:`reset_atom_ids ` command can be used after this +command to accomplish the same thing. Note that the re-assignment of IDs is not really a compression, where gaps in atom IDs are removed by decrementing atom IDs that are larger. @@ -100,15 +108,15 @@ the :doc:`create_atoms ` command explains. 
A molecular system with fixed bonds, angles, dihedrals, or improper interactions, is one where the topology of the interactions is -typically defined in the data file read by the -:doc:`read_data ` command, and where the interactions -themselves are defined with the :doc:`bond_style `, -:doc:`angle_style `, etc commands. If you delete atoms -from such a system, you must be careful not to end up with bonded -interactions that are stored by remaining atoms but which include -deleted atoms. This will cause LAMMPS to generate a "missing atoms" -error when the bonded interaction is computed. The *bond* and *mol* -keywords offer two ways to do that. +typically defined in the data file read by the :doc:`read_data +` command, and where the interactions themselves are +defined with the :doc:`bond_style `, :doc:`angle_style +`, etc commands. If you delete atoms from such a system, +you must be careful not to end up with bonded interactions that are +stored by remaining atoms but which include deleted atoms. This will +cause LAMMPS to generate a "missing atoms" error when the bonded +interaction is computed. The *bond* and *mol* keywords offer two ways +to do that. It the *bond* keyword is set to *yes* then any bond or angle or dihedral or improper interaction that includes a deleted atom is also diff --git a/doc/src/dump.rst b/doc/src/dump.rst index c2509e6654..c94813a41e 100644 --- a/doc/src/dump.rst +++ b/doc/src/dump.rst @@ -137,7 +137,7 @@ Examples dump myDump all atom/gz 100 dump.atom.gz dump myDump all atom/zstd 100 dump.atom.zst dump 2 subgroup atom 50 dump.run.bin - dump 2 subgroup atom 50 dump.run.mpiio.bin + dump 2 subgroup atom/mpiio 50 dump.run.mpiio.bin dump 4a all custom 100 dump.myforce.* id type x y vx fx dump 4b flow custom 100 dump.%.myforce id type c_myF[3] v_ke dump 4b flow custom 100 dump.%.myforce id type c_myF[*] v_ke @@ -169,11 +169,12 @@ or multiple smaller files). .. 
note:: - Because periodic boundary conditions are enforced only on - timesteps when neighbor lists are rebuilt, the coordinates of an atom - written to a dump file may be slightly outside the simulation box. - Re-neighbor timesteps will not typically coincide with the timesteps - dump snapshots are written. See the :doc:`dump_modify pbc ` command if you with to force coordinates to be + Because periodic boundary conditions are enforced only on timesteps + when neighbor lists are rebuilt, the coordinates of an atom written + to a dump file may be slightly outside the simulation box. + Re-neighbor timesteps will not typically coincide with the + timesteps dump snapshots are written. See the :doc:`dump_modify + pbc ` command if you wish to force coordinates to be strictly inside the simulation box. .. note:: @@ -189,20 +190,21 @@ or multiple smaller files). multiple processors, each of which owns a subset of the atoms. For the *atom*, *custom*, *cfg*, and *local* styles, sorting is off by -default. For the *dcd*, *xtc*, *xyz*, and *molfile* styles, sorting by -atom ID is on by default. See the :doc:`dump_modify ` doc -page for details. +default. For the *dcd*, *xtc*, *xyz*, and *molfile* styles, sorting +by atom ID is on by default. See the :doc:`dump_modify ` +doc page for details. -The *atom/gz*, *cfg/gz*, *custom/gz*, *local/gz*, and *xyz/gz* styles are identical -in command syntax to the corresponding styles without "gz", however, -they generate compressed files using the zlib library. Thus the filename -suffix ".gz" is mandatory. This is an alternative approach to writing -compressed files via a pipe, as done by the regular dump styles, which -may be required on clusters where the interface to the high-speed network -disallows using the fork() library call (which is needed for a pipe). -For the remainder of this doc page, you should thus consider the *atom* -and *atom/gz* styles (etc) to be inter-changeable, with the exception -of the required filename suffix.
+The *atom/gz*, *cfg/gz*, *custom/gz*, *local/gz*, and *xyz/gz* styles +are identical in command syntax to the corresponding styles without +"gz", however, they generate compressed files using the zlib +library. Thus the filename suffix ".gz" is mandatory. This is an +alternative approach to writing compressed files via a pipe, as done +by the regular dump styles, which may be required on clusters where +the interface to the high-speed network disallows using the fork() +library call (which is needed for a pipe). For the remainder of this +doc page, you should thus consider the *atom* and *atom/gz* styles +(etc) to be inter-changeable, with the exception of the required +filename suffix. Similarly, the *atom/zstd*, *cfg/zstd*, *custom/zstd*, *local/zstd*, and *xyz/zstd* styles are identical to the gz styles, but use the Zstd @@ -219,6 +221,11 @@ you should thus consider the *atom* and *atom/mpiio* styles (etc) to be inter-changeable. The one exception is how the filename is specified for the MPI-IO styles, as explained below. +.. warning:: + + The MPIIO package is currently unmaintained and has become + unreliable. Use with caution. + The precision of values output to text-based dump files can be controlled by the :doc:`dump_modify format ` command and its options. @@ -275,10 +282,11 @@ This bounding box is convenient for many visualization programs. The meaning of the 6 character flags for "xx yy zz" is the same as above. Note that the first two numbers on each line are now xlo_bound instead -of xlo, etc, since they represent a bounding box. See the :doc:`Howto triclinic ` page for a geometric description -of triclinic boxes, as defined by LAMMPS, simple formulas for how the -6 bounding box extents (xlo_bound,xhi_bound,etc) are calculated from -the triclinic parameters, and how to transform those parameters to and +of xlo, etc, since they represent a bounding box. 
See the :doc:`Howto +triclinic ` page for a geometric description of +triclinic boxes, as defined by LAMMPS, simple formulas for how the 6 +bounding box extents (xlo_bound,xhi_bound,etc) are calculated from the +triclinic parameters, and how to transform those parameters to and from other commonly used triclinic representations. The "ITEM: ATOMS" line in each snapshot lists column descriptors for @@ -310,23 +318,24 @@ written to the dump file. This local data is typically calculated by each processor based on the atoms it owns, but there may be zero or more entities per atom, e.g. a list of bond distances. An explanation of the possible dump local attributes is given below. Note that by -using input from the :doc:`compute property/local ` command with dump local, -it is possible to generate information on bonds, angles, etc that can -be cut and pasted directly into a data file read by the -:doc:`read_data ` command. +using input from the :doc:`compute property/local +` command with dump local, it is possible to +generate information on bonds, angles, etc that can be cut and pasted +directly into a data file read by the :doc:`read_data ` +command. Style *cfg* has the same command syntax as style *custom* and writes -extended CFG format files, as used by the -`AtomEye `_ visualization -package. Since the extended CFG format uses a single snapshot of the -system per file, a wildcard "\*" must be included in the filename, as -discussed below. The list of atom attributes for style *cfg* must -begin with either "mass type xs ys zs" or "mass type xsu ysu zsu" -since these quantities are needed to write the CFG files in the -appropriate format (though the "mass" and "type" fields do not appear -explicitly in the file). Any remaining attributes will be stored as -"auxiliary properties" in the CFG files. Note that you will typically -want to use the :doc:`dump_modify element ` command with +extended CFG format files, as used by the `AtomEye +`_ visualization package. 
+Since the extended CFG format uses a single snapshot of the system per +file, a wildcard "\*" must be included in the filename, as discussed +below. The list of atom attributes for style *cfg* must begin with +either "mass type xs ys zs" or "mass type xsu ysu zsu" since these +quantities are needed to write the CFG files in the appropriate format +(though the "mass" and "type" fields do not appear explicitly in the +file). Any remaining attributes will be stored as "auxiliary +properties" in the CFG files. Note that you will typically want to +use the :doc:`dump_modify element ` command with CFG-formatted files, to associate element names with atom types, so that AtomEye can render atoms appropriately. When unwrapped coordinates *xsu*, *ysu*, and *zsu* are requested, the nominal AtomEye @@ -452,6 +461,11 @@ use the :doc:`read_dump ` command or perform other post-processing, just as if the dump file was not written using MPI-IO. +.. warning:: + + The MPIIO package is currently unmaintained and has become + unreliable. Use with caution. + Note that MPI-IO dump files are one large file which all processors write to. You thus cannot use the "%" wildcard character described above in the filename since that specifies generation of multiple @@ -708,8 +722,9 @@ are part of the MPIIO package. They are only enabled if LAMMPS was built with that package. See the :doc:`Build package ` doc page for more info. -The *xtc* style is part of the MISC package. It is only enabled if -LAMMPS was built with that package. See the :doc:`Build package ` page for more info. +The *xtc* and *dcd* styles are part of the EXTRA-DUMP package. They +are only enabled if LAMMPS was built with that package. See the +:doc:`Build package ` page for more info. 
Related commands """""""""""""""" diff --git a/doc/src/dump_image.rst b/doc/src/dump_image.rst index be14a237e5..9b8c7febf4 100644 --- a/doc/src/dump_image.rst +++ b/doc/src/dump_image.rst @@ -6,6 +6,8 @@ dump image command dump movie command ================== +(see below for :ref:`dump_modify options ` specific to dump image/movie) + Syntax """""" @@ -15,7 +17,7 @@ Syntax * ID = user-assigned name for the dump * group-ID = ID of the group of atoms to be imaged -* style = *image* or *movie* = style of dump command (other styles *atom* or *cfg* or *dcd* or *xtc* or *xyz* or *local* or *custom* are discussed on the :doc:`dump ` doc page) +* style = *image* or *movie* = style of dump command (other styles such as *atom* or *cfg* or *dcd* or *xtc* or *xyz* or *local* or *custom* are discussed on the :doc:`dump ` doc page) * N = dump every this many timesteps * file = name of file to write image to * color = atom attribute that determines color of each atom @@ -79,6 +81,69 @@ Syntax seed = random # seed (positive integer) dfactor = strength of shading from 0.0 to 1.0 + +.. _dump_modify_image: + +dump_modify options for dump image/movie +======================================== + +Syntax +"""""" + +.. parsed-literal:: + + dump_modify dump-ID keyword values ... + +* these keywords apply only to the *image* and *movie* styles and are documented on this page +* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate* +* see the :doc:`dump modify ` doc page for more general keywords + + .. parsed-literal:: + + *acolor* args = type color + type = atom type or range of types (see below) + color = name of color or color1/color2/... + *adiam* args = type diam + type = atom type or range of types (see below) + diam = diameter of atoms of that type (distance units) + *amap* args = lo hi style delta N entry1 entry2 ... 
entryN + lo = number or *min* = lower bound of range of color map + hi = number or *max* = upper bound of range of color map + style = 2 letters = "c" or "d" or "s" plus "a" or "f" + "c" for continuous + "d" for discrete + "s" for sequential + "a" for absolute + "f" for fractional + delta = binsize (only used for style "s", otherwise ignored) + binsize = range is divided into bins of this width + N = # of subsequent entries + entry = value color (for continuous style) + value = number or *min* or *max* = single value within range + color = name of color used for that value + entry = lo hi color (for discrete style) + lo/hi = number or *min* or *max* = lower/upper bound of subset of range + color = name of color used for that subset of values + entry = color (for sequential style) + color = name of color used for a bin of values + *backcolor* arg = color + color = name of color for background + *bcolor* args = type color + type = bond type or range of types (see below) + color = name of color or color1/color2/... + *bdiam* args = type diam + type = bond type or range of types (see below) + diam = diameter of bonds of that type (distance units) + *boxcolor* arg = color + color = name of color for simulation box lines and processor sub-domain lines + *color* args = name R G B + name = name of color + R,G,B = red/green/blue numeric values from 0.0 to 1.0 + *bitrate* arg = rate + rate = target bitrate for movie in kbps + *framerate* arg = fps + fps = frames per second for movie + Examples """""""" @@ -91,6 +156,8 @@ Examples dump m1 all movie 1000 movie.avi type type size 640 480 dump m2 all movie 100 movie.m4v type type zoom 1.8 adiam v_value size 1280 720 + dump_modify 1 amap min max cf 0.0 3 min green 0.5 yellow max blue boxcolor red + Description """"""""""" @@ -145,10 +212,10 @@ is used. Similarly, the format of the resulting movie is chosen with the *movie* dump style. 
This is handled by the underlying FFmpeg converter and thus details have to be looked up in the `FFmpeg documentation -`_. -Typical examples are: .avi, .mpg, .m4v, .mp4, .mkv, .flv, .mov, .gif -Additional settings of the movie compression like bitrate and -framerate can be set using the :doc:`dump_modify ` command. +`_. Typical examples are: .avi, .mpg, +.m4v, .mp4, .mkv, .flv, .mov, .gif. Additional settings of the movie +compression like bitrate and framerate can be set using the +dump_modify command as described below. To write out JPEG and PNG format files, you must build LAMMPS with support for the corresponding JPEG or PNG library. To convert images @@ -210,19 +277,20 @@ to colors is as follows: * type 6 = cyan and repeats itself for types > 6. This mapping can be changed by the -:doc:`dump_modify acolor ` command. +"dump_modify acolor" command, as described below. If *type* is specified for the *diameter* setting then the diameter of each atom is determined by its atom type. By default all types have -diameter 1.0. This mapping can be changed by the :doc:`dump_modify adiam ` command. +diameter 1.0. This mapping can be changed by the "dump_modify adiam" +command, as described below. If *element* is specified for the *color* and/or *diameter* setting, then the color and/or diameter of each atom is determined by which element it is, which in turn is specified by the element-to-type -mapping specified by the "dump_modify element" command. By default -every atom type is C (carbon). Every element has a color and diameter -associated with it, which is the same as the colors and sizes used by -the `AtomEye `_ visualization package. +mapping specified by the "dump_modify element" command, as described +below. By default every atom type is C (carbon). Every element has a +color and diameter associated with it, which is the same as the colors +and sizes used by the `AtomEye `_ visualization package. ..
_atomeye: http://li.mit.edu/Archive/Graphics/A/ @@ -232,13 +300,13 @@ settings, they are interpreted in the following way. If "vx", for example, is used as the *color* setting, then the color of the atom will depend on the x-component of its velocity. The association of a per-atom value with a specific color is determined by -a "color map", which can be specified via the -:doc:`dump_modify ` command. The basic idea is that the -atom-attribute will be within a range of values, and every value -within the range is mapped to a specific color. Depending on how the -color map is defined, that mapping can take place via interpolation so -that a value of -3.2 is halfway between "red" and "blue", or -discretely so that the value of -3.2 is "orange". +a "color map", which can be specified via the dump_modify command, as +described below. The basic idea is that the atom-attribute will be +within a range of values, and every value within the range is mapped +to a specific color. Depending on how the color map is defined, that +mapping can take place via interpolation so that a value of -3.2 is +halfway between "red" and "blue", or discretely so that the value of +-3.2 is "orange". If "vx", for example, is used as the *diameter* setting, then the atom will be rendered using the x-component of its velocity as the @@ -251,9 +319,10 @@ diameter, which can be used as the *diameter* setting. The various keywords listed above control how the image is rendered. As listed below, all of the keywords have defaults, most of which you -will likely not need to change. The :doc:`dump modify ` -also has options specific to the dump image style, particularly for -assigning colors to atoms, bonds, and other image features. +will likely not need to change. As described below, the dump modify +command also has options specific to the dump image style, +particularly for assigning colors to atoms, bonds, and other image +features. 
---------- @@ -295,7 +364,7 @@ types to colors is as follows: * type 6 = cyan and repeats itself for bond types > 6. This mapping can be changed by -the :doc:`dump_modify bcolor ` command. +the "dump_modify bcolor" command, as described below. The bond *width* value can be a numeric value or *atom* or *type* (or *none* as indicated above). @@ -310,7 +379,8 @@ of the 2 atoms in the bond. If *type* is specified for the *width* value then the diameter of each bond is determined by its bond type. By default all types have -diameter 0.5. This mapping can be changed by the :doc:`dump_modify bdiam ` command. +diameter 0.5. This mapping can be changed by the "dump_modify bdiam" command, +as described below. ---------- @@ -330,7 +400,7 @@ mapping of types to colors is as follows: * type 6 = cyan and repeats itself for types > 6. There is not yet an option to -change this via the :doc:`dump_modify ` command. +change this via the dump_modify command. The line *width* can only be a numeric value, which specifies that all lines will be drawn as cylinders with that diameter, e.g. 1.0, which @@ -357,7 +427,7 @@ default the mapping of types to colors is as follows: * type 6 = cyan and repeats itself for types > 6. There is not yet an option to -change this via the :doc:`dump_modify ` command. +change this via the dump_modify command. ---------- @@ -390,7 +460,7 @@ particle. By default the mapping of types to colors is as follows: * type 6 = cyan and repeats itself for types > 6. There is not yet an option to -change this via the :doc:`dump_modify ` command. +change this via the dump_modify command. ---------- @@ -414,7 +484,7 @@ the mapping of types to colors is as follows: * type 6 = cyan and repeats itself for types > 6. There is not yet an option to -change this via the :doc:`dump_modify ` command. +change this via the dump_modify command. ---------- @@ -488,7 +558,8 @@ are rendered as thin cylinders in the image. 
If *no* is set, then the box boundaries are not drawn and the *diam* setting is ignored. If *yes* is set, the 12 edges of the box are drawn, with a diameter that is a fraction of the shortest box length in x,y,z (for 3d) or x,y (for -2d). The color of the box boundaries can be set with the :doc:`dump_modify boxcolor ` command. +2d). The color of the box boundaries can be set with the "dump_modify +boxcolor" command. The *axes* keyword determines if and how the coordinate axes are rendered as thin cylinders in the image. If *no* is set, then the @@ -507,7 +578,8 @@ set (default), then the sub-domain boundaries are not drawn and the *diam* setting is ignored. If *yes* is set, the 12 edges of each processor sub-domain are drawn, with a diameter that is a fraction of the shortest box length in x,y,z (for 3d) or x,y (for 2d). The color -of the sub-domain boundaries can be set with the :doc:`dump_modify boxcolor ` command. +of the sub-domain boundaries can be set with the "dump_modify +boxcolor" command. ---------- @@ -607,9 +679,272 @@ Play the movie: ---------- -See the :doc:`Modify ` page for information on how to add -new compute and fix styles to LAMMPS to calculate per-atom quantities -which could then be output into dump files. +Dump_modify keywords for dump image and dump movie +"""""""""""""""""""""""""""""""""""""""""""""""""" + +The following dump_modify keywords apply only to the dump image and +dump movie styles. Any keyword that works with dump image also works +with dump movie, since the movie is simply a collection of images. +Some of the keywords only affect the dump movie style. The +descriptions give details. + +---------- + +The *acolor* keyword can be used with the dump image command, when its +atom color setting is *type*, to set the color that atoms of each type +will be drawn in the image. + +The specified *type* should be an integer from 1 to Ntypes = the +number of atom types. 
A wildcard asterisk can be used in place of or +in conjunction with the *type* argument to specify a range of atom +types. This takes the form "\*" or "\*n" or "n\*" or "m\*n". If N = +the number of atom types, then an asterisk with no numeric values +means all types from 1 to N. A leading asterisk means all types from +1 to n (inclusive). A trailing asterisk means all types from n to N +(inclusive). A middle asterisk means all types from m to n +(inclusive). + +The specified *color* can be a single color which is any of the 140 +pre-defined colors (see below) or a color name defined by the +"dump_modify color" command, as described below. Or it can be two or +more colors separated by a "/" character, e.g. red/green/blue. In the +former case, that color is assigned to all the specified atom types. +In the latter case, the list of colors are assigned in a round-robin +fashion to each of the specified atom types. + +---------- + +The *adiam* keyword can be used with the dump image command, when its +atom diameter setting is *type*, to set the size that atoms of each +type will be drawn in the image. The specified *type* should be an +integer from 1 to Ntypes. As with the *acolor* keyword, a wildcard +asterisk can be used as part of the *type* argument to specify a range +of atom types. The specified *diam* is the size in whatever distance +:doc:`units ` the input script is using, e.g. Angstroms. + +---------- + +The *amap* keyword can be used with the dump image command, with its +*atom* keyword, when its atom setting is an atom-attribute, to set up a +color map. The color map is used to assign a specific RGB +(red/green/blue) color value to an individual atom when it is drawn, +based on the atom's attribute, which is a numeric value, e.g. its +x-component of velocity if the atom-attribute "vx" was specified. + +The basic idea of a color map is that the atom-attribute will be +within a range of values, and that range is associated with a series +of colors (e.g.
red, blue, green). An atom's specific value (vx = +-3.2) can then be mapped to the series of colors (e.g. halfway between +red and blue), and a specific color is determined via an interpolation +procedure. + +There are many possible options for the color map, enabled by the +*amap* keyword. Here are the details. + +The *lo* and *hi* settings determine the range of values allowed for +the atom attribute. If numeric values are used for *lo* and/or *hi*, +then values that are lower/higher than that value are set to the +value. I.e. the range is static. If *lo* is specified as *min* or +*hi* as *max* then the range is dynamic, and the lower and/or +upper bound will be calculated each time an image is drawn, based +on the set of atoms being visualized. + +The *style* setting is two letters, such as "ca". The first letter is +either "c" for continuous, "d" for discrete, or "s" for sequential. +The second letter is either "a" for absolute, or "f" for fractional. + +A continuous color map is one in which the color changes continuously +from value to value within the range. A discrete color map is one in +which discrete colors are assigned to sub-ranges of values within the +range. A sequential color map is one in which discrete colors are +assigned to a sequence of sub-ranges of values covering the entire +range. + +An absolute color map is one in which the values to which colors are +assigned are specified explicitly as values within the range. A +fractional color map is one in which the values to which colors are +assigned are specified as a fractional portion of the range. For +example if the range is from -10.0 to 10.0, and the color red is to be +assigned to atoms with a value of 5.0, then for an absolute color map +the number 5.0 would be used. But for a fractional map, the number +0.75 would be used since 5.0 is 3/4 of the way from -10.0 to 10.0.
+ +The *delta* setting must be specified for all styles, but is only used +for the sequential style; otherwise the value is ignored. It +specifies the bin size to use within the range for assigning +consecutive colors to. For example, if the range is from -10.0 to +10.0 and a *delta* of 1.0 is used, then 20 colors will be assigned to +the range. The first will be from -10.0 <= color1 < -9.0, the second +from -9.0 <= color2 < -8.0, etc. + +The *N* setting is how many entries follow. The format of the entries +depends on whether the color map style is continuous, discrete or +sequential. In all cases the *color* setting can be any of the 140 +pre-defined colors (see below) or a color name defined by the +dump_modify color option. + +For continuous color maps, each entry has a *value* and a *color*\ . +The *value* is either a number within the range of values or *min* or +*max*\ . The *value* of the first entry must be *min* and the *value* +of the last entry must be *max*\ . Any entries in between must have +increasing values. Note that numeric values can be specified either +as absolute numbers or as fractions (0.0 to 1.0) of the range, +depending on the "a" or "f" in the style setting for the color map. + +Here is how the entries are used to determine the color of an +individual atom, given the value X of its atom attribute. X will fall +between 2 of the entry values. The color of the atom is linearly +interpolated (in each of the RGB values) between the 2 colors +associated with those entries. For example, if X = -5.0 and the 2 +surrounding entries are "red" at -10.0 and "blue" at 0.0, then the +atom's color will be halfway between "red" and "blue", which happens +to be "purple". + +For discrete color maps, each entry has a *lo* and *hi* value and a +*color*\ . The *lo* and *hi* settings are either numbers within the +range of values or *lo* can be *min* or *hi* can be *max*\ . The *lo* +and *hi* settings of the last entry must be *min* and *max*\ .
Other +entries can have any *lo* and *hi* values and the sub-ranges of +different values can overlap. Note that numeric *lo* and *hi* values +can be specified either as absolute numbers or as fractions (0.0 to +1.0) of the range, depending on the "a" or "f" in the style setting +for the color map. + +Here is how the entries are used to determine the color of an +individual atom, given the value X of its atom attribute. The entries +are scanned from first to last. The first time that *lo* <= X <= +*hi*, X is assigned the color associated with that entry. You can +think of the last entry as assigning a default color (since it will +always be matched by X), and the earlier entries as colors that +override the default. Also note that no interpolation of a color RGB +is done. All atoms will be drawn with one of the colors in the list +of entries. + +For sequential color maps, each entry has only a *color*\ . Here is how +the entries are used to determine the color of an individual atom, +given the value X of its atom attribute. The range is partitioned +into N bins of width *binsize*\ . Thus X will fall in a specific bin +from 1 to N, say the Mth bin. If it falls on a boundary between 2 +bins, it is considered to be in the higher of the 2 bins. Each bin is +assigned a color from the E entries. If E < N, then the colors are +repeated. For example if 2 entries with colors red and green are +specified, then the odd numbered bins will be red and the even bins +green. The color of the atom is the color of its bin. Note that the +sequential color map is really a shorthand way of defining a discrete +color map without having to specify where all the bin boundaries are. + +Here is an example of using a sequential color map to color all the +atoms in individual molecules with a different color. See the +examples/pour/in.pour.2d.molecule input script for an example of how +this is used. + +.. 
code-block:: LAMMPS + + variable colors string & + "red green blue yellow white & + purple pink orange lime gray" + variable mol atom mol%10 + dump 1 all image 250 image.*.jpg v_mol type & + zoom 1.6 adiam 1.5 + dump_modify 1 pad 5 amap 0 10 sa 1 10 ${colors} + +In this case, 10 colors are defined, and molecule IDs are +mapped to one of the colors, even if there are 1000s of molecules. + +---------- + +The *backcolor* sets the background color of the images. The color +name can be any of the 140 pre-defined colors (see below) or a color +name defined by the dump_modify color option. + +---------- + +The *bcolor* keyword can be used with the dump image command, with its +*bond* keyword, when its color setting is *type*, to set the color +that bonds of each type will be drawn in the image. + +The specified *type* should be an integer from 1 to Nbondtypes = the +number of bond types. A wildcard asterisk can be used in place of or +in conjunction with the *type* argument to specify a range of bond +types. This takes the form "\*" or "\*n" or "n\*" or "m\*n". If N = +the number of bond types, then an asterisk with no numeric values +means all types from 1 to N. A leading asterisk means all types from +1 to n (inclusive). A trailing asterisk means all types from n to N +(inclusive). A middle asterisk means all types from m to n +(inclusive). + +The specified *color* can be a single color which is any of the 140 +pre-defined colors (see below) or a color name defined by the +dump_modify color option. Or it can be two or more colors separated +by a "/" character, e.g. red/green/blue. In the former case, that +color is assigned to all the specified bond types. In the latter +case, the list of colors are assigned in a round-robin fashion to each +of the specified bond types. 
+ +---------- + +The *bdiam* keyword can be used with the dump image command, with its +*bond* keyword, when its diam setting is *type*, to set the diameter +that bonds of each type will be drawn in the image. The specified +*type* should be an integer from 1 to Nbondtypes. As with the +*bcolor* keyword, a wildcard asterisk can be used as part of the +*type* argument to specify a range of bond types. The specified +*diam* is the size in whatever distance :doc:`units <units>` you are +using, e.g. Angstroms. + +---------- + +The *bitrate* keyword can be used with the :doc:`dump movie +<dump_image>` command to define the size of the resulting movie file +and its quality via setting how many kbits per second are to be used +for the movie file. Higher bitrates require less compression and will +result in higher quality movies. The quality is also determined by +the compression format and encoder. The default setting is 2000 +kbit/s, which will result in average quality with older compression +formats. + +.. note:: + + Not all movie file formats supported by dump movie allow the + bitrate to be set. If not, the setting is silently ignored. + +---------- + +The *boxcolor* keyword sets the color of the simulation box drawn +around the atoms in each image as well as the color of processor +sub-domain boundaries. See the "dump image box" command for how to +specify that a box be drawn via the *box* keyword, and the sub-domain +boundaries via the *subbox* keyword. The color name can be any of the +140 pre-defined colors (see below) or a color name defined by the +dump_modify color option. + +---------- + +The *color* keyword allows definition of a new color name, in addition +to the 140 pre-defined colors (see below), and associates 3 +red/green/blue RGB values with that color name. The color name can +then be used with any other dump_modify keyword that takes a color +name as a value. The RGB values should each be floating point values +between 0.0 and 1.0 inclusive.
+ +When a color name is converted to RGB values, the user-defined color +names are searched first, then the 140 pre-defined color names. This +means you can also use the *color* keyword to overwrite one of the +pre-defined color names with new RGB values. + +---------- + +The *framerate* keyword can be used with the :doc:`dump movie +<dump_image>` command to define the duration of the resulting movie +file. Movie files written by the dump *movie* command have a default +frame rate of 24 frames per second and the images generated will be +converted at that rate. Thus a sequence of 1000 dump images will +result in a movie of about 42 seconds. To make a movie run longer you +can either generate images more frequently or lower the frame rate. +To speed a movie up, you can do the inverse. Using a frame rate +higher than 24 is not recommended, as it will result in simply +dropping the rendered images. It is more efficient to dump images less +frequently. ---------- @@ -664,7 +999,7 @@ Related commands Default """"""" -The defaults for the keywords are as follows: +The defaults for the dump image and dump movie keywords are as follows: * adiam = not specified (use diameter setting) * atom = yes @@ -682,3 +1017,101 @@ The defaults for the keywords are as follows: * subbox no 0.0 * shiny = 1.0 * ssao = no + +---------- + +The defaults for the dump_modify keywords specific to dump image and dump movie are as follows: + +* acolor = \* red/green/blue/yellow/aqua/cyan +* adiam = \* 1.0 +* amap = min max cf 0.0 2 min blue max red +* backcolor = black +* bcolor = \* red/green/blue/yellow/aqua/cyan +* bdiam = \* 0.5 +* bitrate = 2000 +* boxcolor = yellow +* color = 140 color names are pre-defined as listed below +* framerate = 24 + +---------- + +These are the standard 109 element names that LAMMPS pre-defines for +use with the dump image and dump_modify commands.
+ +* 1-10 = "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne" +* 11-20 = "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca" +* 21-30 = "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn" +* 31-40 = "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr" +* 41-50 = "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn" +* 51-60 = "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd" +* 61-70 = "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb" +* 71-80 = "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg" +* 81-90 = "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th" +* 91-100 = "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm" +* 101-109 = "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt" + +---------- + +These are the 140 colors that LAMMPS pre-defines for use with the dump +image and dump_modify commands. Additional colors can be defined with +the dump_modify color command. The 3 numbers listed for each name are +the RGB (red/green/blue) values. Divide each value by 255 to get the +equivalent 0.0 to 1.0 value. 
+ ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| aliceblue = 240, 248, 255 | antiquewhite = 250, 235, 215 | aqua = 0, 255, 255 | aquamarine = 127, 255, 212 | azure = 240, 255, 255 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| beige = 245, 245, 220 | bisque = 255, 228, 196 | black = 0, 0, 0 | blanchedalmond = 255, 255, 205 | blue = 0, 0, 255 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| blueviolet = 138, 43, 226 | brown = 165, 42, 42 | burlywood = 222, 184, 135 | cadetblue = 95, 158, 160 | chartreuse = 127, 255, 0 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| chocolate = 210, 105, 30 | coral = 255, 127, 80 | cornflowerblue = 100, 149, 237 | cornsilk = 255, 248, 220 | crimson = 220, 20, 60 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| cyan = 0, 255, 255 | darkblue = 0, 0, 139 | darkcyan = 0, 139, 139 | darkgoldenrod = 184, 134, 11 | darkgray = 169, 169, 169 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| darkgreen = 0, 100, 0 | darkkhaki = 189, 183, 107 | darkmagenta = 139, 0, 139 | darkolivegreen = 85, 107, 47 | darkorange = 255, 140, 0 | 
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| darkorchid = 153, 50, 204 | darkred = 139, 0, 0 | darksalmon = 233, 150, 122 | darkseagreen = 143, 188, 143 | darkslateblue = 72, 61, 139 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| darkslategray = 47, 79, 79 | darkturquoise = 0, 206, 209 | darkviolet = 148, 0, 211 | deeppink = 255, 20, 147 | deepskyblue = 0, 191, 255 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| dimgray = 105, 105, 105 | dodgerblue = 30, 144, 255 | firebrick = 178, 34, 34 | floralwhite = 255, 250, 240 | forestgreen = 34, 139, 34 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| fuchsia = 255, 0, 255 | gainsboro = 220, 220, 220 | ghostwhite = 248, 248, 255 | gold = 255, 215, 0 | goldenrod = 218, 165, 32 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| gray = 128, 128, 128 | green = 0, 128, 0 | greenyellow = 173, 255, 47 | honeydew = 240, 255, 240 | hotpink = 255, 105, 180 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| indianred = 205, 92, 92 | indigo = 75, 0, 130 | ivory = 255, 240, 240 | khaki = 240, 230, 140 | lavender = 230, 230, 250 | 
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| lavenderblush = 255, 240, 245 | lawngreen = 124, 252, 0 | lemonchiffon = 255, 250, 205 | lightblue = 173, 216, 230 | lightcoral = 240, 128, 128 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| lightcyan = 224, 255, 255 | lightgoldenrodyellow = 250, 250, 210 | lightgreen = 144, 238, 144 | lightgrey = 211, 211, 211 | lightpink = 255, 182, 193 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| lightsalmon = 255, 160, 122 | lightseagreen = 32, 178, 170 | lightskyblue = 135, 206, 250 | lightslategray = 119, 136, 153 | lightsteelblue = 176, 196, 222 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| lightyellow = 255, 255, 224 | lime = 0, 255, 0 | limegreen = 50, 205, 50 | linen = 250, 240, 230 | magenta = 255, 0, 255 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| maroon = 128, 0, 0 | mediumaquamarine = 102, 205, 170 | mediumblue = 0, 0, 205 | mediumorchid = 186, 85, 211 | mediumpurple = 147, 112, 219 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| mediumseagreen = 60, 179, 113 | mediumslateblue = 123, 104, 238 | mediumspringgreen = 0, 250, 154 | mediumturquoise = 72, 209, 204 | mediumvioletred = 199, 21, 133 | 
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| midnightblue = 25, 25, 112 | mintcream = 245, 255, 250 | mistyrose = 255, 228, 225 | moccasin = 255, 228, 181 | navajowhite = 255, 222, 173 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| navy = 0, 0, 128 | oldlace = 253, 245, 230 | olive = 128, 128, 0 | olivedrab = 107, 142, 35 | orange = 255, 165, 0 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| orangered = 255, 69, 0 | orchid = 218, 112, 214 | palegoldenrod = 238, 232, 170 | palegreen = 152, 251, 152 | paleturquoise = 175, 238, 238 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| palevioletred = 219, 112, 147 | papayawhip = 255, 239, 213 | peachpuff = 255, 239, 213 | peru = 205, 133, 63 | pink = 255, 192, 203 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| plum = 221, 160, 221 | powderblue = 176, 224, 230 | purple = 128, 0, 128 | red = 255, 0, 0 | rosybrown = 188, 143, 143 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| royalblue = 65, 105, 225 | saddlebrown = 139, 69, 19 | salmon = 250, 128, 114 | sandybrown = 244, 164, 96 | seagreen = 46, 139, 87 | 
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| seashell = 255, 245, 238 | sienna = 160, 82, 45 | silver = 192, 192, 192 | skyblue = 135, 206, 235 | slateblue = 106, 90, 205 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| slategray = 112, 128, 144 | snow = 255, 250, 250 | springgreen = 0, 255, 127 | steelblue = 70, 130, 180 | tan = 210, 180, 140 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| teal = 0, 128, 128 | thistle = 216, 191, 216 | tomato = 253, 99, 71 | turquoise = 64, 224, 208 | violet = 238, 130, 238 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ +| wheat = 245, 222, 179 | white = 255, 255, 255 | whitesmoke = 245, 245, 245 | yellow = 255, 255, 0 | yellowgreen = 154, 205, 50 | ++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ diff --git a/doc/src/dump_modify.rst b/doc/src/dump_modify.rst index 5fea976e70..be75153f6f 100644 --- a/doc/src/dump_modify.rst +++ b/doc/src/dump_modify.rst @@ -3,6 +3,9 @@ dump_modify command =================== +:doc:`dump_modify ` command for image/movie options +=============================================================== + Syntax """""" @@ -12,8 +15,9 @@ Syntax * dump-ID = ID of dump to modify * one or more keyword/value pairs may be appended + * these keywords apply to various dump styles -* keyword = *append* or *at* or *buffer* or *delay* or *element* or *every* or *fileper* or *first* or 
*flush* or *format* or *image* or *label* or *maxfiles* or *nfile* or *pad* or *pbc* or *precision* or *region* or *refresh* or *scale* or *sfactor* or *sort* or *tfactor* or *thermo* or *thresh* or *time* or *units* or *unwrap* +* keyword = *append* or *at* or *buffer* or *delay* or *element* or *every* or *every/time* or *fileper* or *first* or *flush* or *format* or *header* or *image* or *label* or *maxfiles* or *nfile* or *pad* or *pbc* or *precision* or *region* or *refresh* or *scale* or *sfactor* or *sort* or *tfactor* or *thermo* or *thresh* or *time* or *units* or *unwrap* .. parsed-literal:: @@ -28,6 +32,9 @@ Syntax *every* arg = N N = dump every this many timesteps N can be a variable (see below) + *every/time* arg = Delta + Delta = dump every this interval in simulation time (time units) + Delta can be a variable (see below) *fileper* arg = Np Np = write one file for every this many processors *first* arg = *yes* or *no* @@ -35,6 +42,9 @@ Syntax *format* args = *line* string, *int* string, *float* string, M string, or *none* string = C-style format string M = integer from 1 to N, where N = # of per-atom quantities being output + *header* arg = *yes* or *no* + *yes* to write the header + *no* to not write the header *image* arg = *yes* or *no* *label* arg = string string = character string (e.g. BONDS) to use in header of dump local file @@ -66,56 +76,11 @@ Syntax *unwrap* arg = *yes* or *no* * these keywords apply only to the *image* and *movie* :doc:`styles ` -* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate* or *header* +* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate* .. parsed-literal:: - *acolor* args = type color - type = atom type or range of types (see below) - color = name of color or color1/color2/... 
- *adiam* args = type diam - type = atom type or range of types (see below) - diam = diameter of atoms of that type (distance units) - *amap* args = lo hi style delta N entry1 entry2 ... entryN - lo = number or *min* = lower bound of range of color map - hi = number or *max* = upper bound of range of color map - style = 2 letters = "c" or "d" or "s" plus "a" or "f" - "c" for continuous - "d" for discrete - "s" for sequential - "a" for absolute - "f" for fractional - delta = binsize (only used for style "s", otherwise ignored) - binsize = range is divided into bins of this width - N = # of subsequent entries - entry = value color (for continuous style) - value = number or *min* or *max* = single value within range - color = name of color used for that value - entry = lo hi color (for discrete style) - lo/hi = number or *min* or *max* = lower/upper bound of subset of range - color = name of color used for that subset of values - entry = color (for sequential style) - color = name of color used for a bin of values - *backcolor* arg = color - color = name of color for background - *bcolor* args = type color - type = bond type or range of types (see below) - color = name of color or color1/color2/... 
- *bdiam* args = type diam - type = bond type or range of types (see below) - diam = diameter of bonds of that type (distance units) - *boxcolor* arg = color - color = name of color for simulation box lines and processor sub-domain lines - *color* args = name R G B - name = name of color - R,G,B = red/green/blue numeric values from 0.0 to 1.0 - *bitrate* arg = rate - rate = target bitrate for movie in kbps - *framerate* arg = fps - fps = frames per second for movie - *header* arg = *yes* or *no* - *yes* to write the header - *no* to not write the header + see the :doc:`dump image ` doc page for details * these keywords apply only to the */gz* and */zstd* dump styles * keyword = *compression_level* @@ -126,7 +91,7 @@ Syntax level = integer specifying the compression level that should be used (see below for supported levels) * these keywords apply only to the */zstd* dump styles -* keyword = *compression_level* +* keyword = *checksum* .. parsed-literal:: @@ -144,7 +109,6 @@ Examples dump_modify xtcdump precision 10000 sfactor 0.1 dump_modify 1 every 1000 nfile 20 dump_modify 1 every v_myVar - dump_modify 1 amap min max cf 0.0 3 min green 0.5 yellow max blue boxcolor red Description """"""""""" @@ -163,8 +127,9 @@ which allow for use of MPI-IO. ---------- -These keywords apply to various dump styles, including the :doc:`dump image ` and :doc:`dump movie ` styles. The -description gives details. +Unless otherwise noted, the following keywords apply to all the +various dump styles, including the :doc:`dump image ` and +:doc:`dump movie ` styles. ---------- @@ -235,11 +200,19 @@ will be accepted. ---------- -The *every* keyword changes the dump frequency originally specified by -the :doc:`dump ` command to a new value. The every keyword can be -specified in one of two ways. It can be a numeric value in which case -it must be > 0. Or it can be an :doc:`equal-style variable `, -which should be specified as v_name, where name is the variable name. 
+The *every* keyword can be used with any dump style except the *dcd* +and *xtc* styles. It does two things. It specifies that the interval +between dump snapshots will be set in timesteps, which is the default +if the *every* or *every/time* keywords are not used. See the +*every/time* keyword for how to specify the interval in simulation +time, i.e. in time units of the :doc:`units ` command. The +*every* keyword also sets the interval value, which overrides the dump +frequency originally specified by the :doc:`dump ` command. + +The *every* keyword can be specified in one of two ways. It can be a +numeric value in which case it must be > 0. Or it can be an +:doc:`equal-style variable `, which should be specified as +v_name, where name is the variable name. In this case, the variable is evaluated at the beginning of a run to determine the next timestep at which a dump snapshot will be written @@ -248,11 +221,12 @@ determine the next timestep, etc. Thus the variable should return timestep values. See the stagger() and logfreq() and stride() math functions for :doc:`equal-style variables `, as examples of useful functions to use in this context. Other similar math functions -could easily be added as options for :doc:`equal-style variables `. Also see the next() function, which allows -use of a file-style variable which reads successive values from a -file, each time the variable is evaluated. Used with the *every* -keyword, if the file contains a list of ascending timesteps, you can -output snapshots whenever you wish. +could easily be added as options for :doc:`equal-style variables +`. Also see the next() function, which allows use of a +file-style variable which reads successive values from a file, each +time the variable is evaluated. Used with the *every* keyword, if the +file contains a list of ascending timesteps, you can output snapshots +whenever you wish. 
Note that when using the variable option with the *every* keyword, you need to use the *first* option if you want an initial snapshot written @@ -293,14 +267,103 @@ in file tmp.times: ---------- +The *every/time* keyword can be used with any dump style except the +*dcd* and *xtc* styles. It does two things. It specifies that the +interval between dump snapshots will be set in simulation time, +i.e. in time units of the :doc:`units <units>` command. This can be +useful when the timestep size varies during a simulation run, e.g. by +use of the :doc:`fix dt/reset <fix_dt_reset>` command. The default is +to specify the interval in timesteps; see the *every* keyword. The +*every/time* keyword also sets the interval value. + +.. note:: + + If you wish dump styles *atom*, *custom*, *local*, or *xyz* to + include the simulation time as a field in the header portion of + each snapshot, you also need to use the dump_modify *time* keyword + with a setting of *yes*. See its documentation below. + +Note that since snapshots are output on simulation steps, each +snapshot will be written on the first timestep whose associated +simulation time is >= the exact snapshot time value. + +As with the *every* option, the *Delta* value can be specified in one +of two ways. It can be a numeric value in which case it must be > +0.0. Or it can be an :doc:`equal-style variable <variable>`, which +should be specified as v_name, where name is the variable name. + +In this case, the variable is evaluated at the beginning of a run to +determine the next simulation time at which a dump snapshot will be +written out. On that timestep the variable will be evaluated again to +determine the next simulation time, etc. Thus the variable should +return values in time units. Note the current timestep or simulation +time can be used in an :doc:`equal-style variable <variable>` since +they are both thermodynamic keywords.
Also see the next() function, +which allows use of a file-style variable which reads successive +values from a file, each time the variable is evaluated. Used with +the *every/time* keyword, if the file contains a list of ascending +simulation times, you can output snapshots whenever you wish. + +Note that when using the variable option with the *every/time* +keyword, you need to use the *first* option if you want an initial +snapshot written to the dump file. The *every/time* keyword cannot be +used with the dump *dcd* or *xtc* styles. + +For example, the following commands will write snapshots at successive +simulation times which grow by a factor of 1.5 with each interval. +The dt value used in the variable is to avoid a zero result when the +initial simulation time is 0.0. + +.. code-block:: LAMMPS + + variable increase equal 1.5*(time+dt) + dump 1 all atom 100 tmp.dump + dump_modify 1 every/time v_increase first yes + +The following commands would write snapshots at the times listed in +file tmp.times: + +.. code-block:: LAMMPS + + variable f file tmp.times + variable s equal next(f) + dump 1 all atom 100 tmp.dump + dump_modify 1 every/time v_s + +.. note:: + + When using a file-style variable with the *every/time* keyword, the + file of times must list a first time that is beyond the time + associated with the current timestep (e.g. it cannot be 0.0). And + it must list one or more times beyond the length of the run you + perform. This is because the dump command will generate an error + if the next time it reads from the file is not a value greater than + the current time. Thus if you wanted output at times 0,15,100 of a + run of length 100 in simulation time, the file should contain the + values 15,100,101 and you should also use the dump_modify first + command. Any final value > 100 could be used in place of 101. + +---------- + The *first* keyword determines whether a dump snapshot is written on the very first timestep after the dump command is invoked.
This will -always occur if the current timestep is a multiple of N, the frequency -specified in the :doc:`dump ` command, including timestep 0. But -if this is not the case, a dump snapshot will only be written if the -setting of this keyword is *yes*\ . If it is *no*, which is the +always occur if the current timestep is a multiple of N, the +frequency specified in the :doc:`dump <dump>` command or +:doc:`dump_modify every <dump_modify>` command, including timestep 0. +It will also always occur if the current simulation time is a multiple +of *Delta*, the time interval specified in the :doc:`dump_modify +every/time <dump_modify>` command. + +But if this is not the case, a dump snapshot will only be written if +the setting of this keyword is *yes*\ . If it is *no*, which is the default, then it will not be written. +Note that if the argument to the :doc:`dump_modify every +<dump_modify>` or :doc:`dump_modify every/time <dump_modify>` commands is +a variable and not a numeric value, then specifying *first yes* is the +only way to write a dump snapshot on the first timestep after the dump +command is invoked. + ---------- The *flush* keyword determines whether a flush operation is invoked @@ -380,6 +443,13 @@ The *fileper* keyword is documented below with the *nfile* keyword. ---------- +The *header* keyword toggles whether the dump file will include a +header. Excluding a header will reduce the size of the dump file for +fixes such as :doc:`fix pair/tracker <fix_pair_tracker>` which do not +require the information typically written to the header. + +---------- + The *image* keyword applies only to the dump *atom* style. If the image value is *yes*, 3 flags are appended to each atom's coords which are the absolute box image of the atom in each dimension. For @@ -592,7 +662,9 @@ The dump *local* style cannot be sorted by atom ID, since there are typically multiple lines of output per atom. Some dump styles, such as *dcd* and *xtc*, require sorting by atom ID to format the output file correctly.
If multiple processors are writing the dump file, via -the "%" wildcard in the dump filename, then sorting cannot be +the "%" wildcard in the dump filename and the *nfile* or *fileper* +keywords are set to non-default values (i.e. the number of dump file +pieces is not equal to the number of procs), then sorting cannot be performed. .. note:: @@ -670,16 +742,20 @@ threshold criterion is met. Otherwise it is not met. ---------- -The *time* keyword only applies to the dump *atom*, *custom*, and -*local* styles (and their COMPRESS package versions *atom/gz*, -*custom/gz* and *local/gz*\ ). If set to *yes*, each frame will will -contain two extra lines before the "ITEM: TIMESTEP" entry: +The *time* keyword only applies to the dump *atom*, *custom*, *local*, +and *xyz* styles (and their COMPRESS package versions *atom/gz*, +*custom/gz* and *local/gz*\ ). For the first 3 styles, if set to +*yes*, each frame will contain two extra lines before the "ITEM: +TIMESTEP" entry: .. parsed-literal:: ITEM: TIME \ +For the *xyz* style, the simulation time is included on the same line +as the timestep value. + This will output the current elapsed simulation time in current time units equivalent to the :doc:`thermo keyword ` *time*\ . This is to simplify post-processing of trajectories using a variable time @@ -715,303 +791,35 @@ box size stored with the snapshot. ---------- -These keywords apply only to the :doc:`dump image ` and -:doc:`dump movie ` styles. Any keyword that affects an -image, also affects a movie, since the movie is simply a collection of -images. Some of the keywords only affect the :doc:`dump movie ` style. The descriptions give details. +The COMPRESS package offers both GZ and Zstd compression variants of +styles atom, custom, local, cfg, and xyz. When using these styles the +compression level can be controlled by the :code:`compression_level` +keyword. File names with these styles have to end in either +:code:`.gz` or :code:`.zst`.
----------- - -The *acolor* keyword can be used with the :doc:`dump image ` -command, when its atom color setting is *type*, to set the color that -atoms of each type will be drawn in the image. - -The specified *type* should be an integer from 1 to Ntypes = the -number of atom types. A wildcard asterisk can be used in place of or -in conjunction with the *type* argument to specify a range of atom -types. This takes the form "\*" or "\*n" or "n\*" or "m\*n". If N = the -number of atom types, then an asterisk with no numeric values means -all types from 1 to N. A leading asterisk means all types from 1 to n -(inclusive). A trailing asterisk means all types from n to N -(inclusive). A middle asterisk means all types from m to n -(inclusive). - -The specified *color* can be a single color which is any of the 140 -pre-defined colors (see below) or a color name defined by the -dump_modify color option. Or it can be two or more colors separated -by a "/" character, e.g. red/green/blue. In the former case, that -color is assigned to all the specified atom types. In the latter -case, the list of colors are assigned in a round-robin fashion to each -of the specified atom types. - ----------- - -The *adiam* keyword can be used with the :doc:`dump image ` -command, when its atom diameter setting is *type*, to set the size -that atoms of each type will be drawn in the image. The specified -*type* should be an integer from 1 to Ntypes. As with the *acolor* -keyword, a wildcard asterisk can be used as part of the *type* -argument to specify a range of atom types. The specified *diam* is -the size in whatever distance :doc:`units ` the input script is -using, e.g. Angstroms. - ----------- - -The *amap* keyword can be used with the :doc:`dump image ` -command, with its *atom* keyword, when its atom setting is an -atom-attribute, to setup a color map. 
The color map is used to assign -a specific RGB (red/green/blue) color value to an individual atom when -it is drawn, based on the atom's attribute, which is a numeric value, -e.g. its x-component of velocity if the atom-attribute "vx" was -specified. - -The basic idea of a color map is that the atom-attribute will be -within a range of values, and that range is associated with a series -of colors (e.g. red, blue, green). An atom's specific value (vx = --3.2) can then mapped to the series of colors (e.g. halfway between -red and blue), and a specific color is determined via an interpolation -procedure. - -There are many possible options for the color map, enabled by the -*amap* keyword. Here are the details. - -The *lo* and *hi* settings determine the range of values allowed for -the atom attribute. If numeric values are used for *lo* and/or *hi*, -then values that are lower/higher than that value are set to the -value. I.e. the range is static. If *lo* is specified as *min* or -*hi* as *max* then the range is dynamic, and the lower and/or -upper bound will be calculated each time an image is drawn, based -on the set of atoms being visualized. - -The *style* setting is two letters, such as "ca". The first letter is -either "c" for continuous, "d" for discrete, or "s" for sequential. -The second letter is either "a" for absolute, or "f" for fractional. - -A continuous color map is one in which the color changes continuously -from value to value within the range. A discrete color map is one in -which discrete colors are assigned to sub-ranges of values within the -range. A sequential color map is one in which discrete colors are -assigned to a sequence of sub-ranges of values covering the entire -range. - -An absolute color map is one in which the values to which colors are -assigned are specified explicitly as values within the range. A -fractional color map is one in which the values to which colors are -assigned are specified as a fractional portion of the range. 
For -example if the range is from -10.0 to 10.0, and the color red is to be -assigned to atoms with a value of 5.0, then for an absolute color map -the number 5.0 would be used. But for a fractional map, the number -0.75 would be used since 5.0 is 3/4 of the way from -10.0 to 10.0. - -The *delta* setting must be specified for all styles, but is only used -for the sequential style; otherwise the value is ignored. It -specifies the bin size to use within the range for assigning -consecutive colors to. For example, if the range is from -10.0 to -10.0 and a *delta* of 1.0 is used, then 20 colors will be assigned to -the range. The first will be from -10.0 <= color1 < -9.0, then second -from -9.0 <= color2 < -8.0, etc. - -The *N* setting is how many entries follow. The format of the entries -depends on whether the color map style is continuous, discrete or -sequential. In all cases the *color* setting can be any of the 140 -pre-defined colors (see below) or a color name defined by the -dump_modify color option. - -For continuous color maps, each entry has a *value* and a *color*\ . -The *value* is either a number within the range of values or *min* or -*max*\ . The *value* of the first entry must be *min* and the *value* -of the last entry must be *max*\ . Any entries in between must have -increasing values. Note that numeric values can be specified either -as absolute numbers or as fractions (0.0 to 1.0) of the range, -depending on the "a" or "f" in the style setting for the color map. - -Here is how the entries are used to determine the color of an -individual atom, given the value X of its atom attribute. X will fall -between 2 of the entry values. The color of the atom is linearly -interpolated (in each of the RGB values) between the 2 colors -associated with those entries. For example, if X = -5.0 and the 2 -surrounding entries are "red" at -10.0 and "blue" at 0.0, then the -atom's color will be halfway between "red" and "blue", which happens -to be "purple". 
- -For discrete color maps, each entry has a *lo* and *hi* value and a -*color*\ . The *lo* and *hi* settings are either numbers within the -range of values or *lo* can be *min* or *hi* can be *max*\ . The *lo* -and *hi* settings of the last entry must be *min* and *max*\ . Other -entries can have any *lo* and *hi* values and the sub-ranges of -different values can overlap. Note that numeric *lo* and *hi* values -can be specified either as absolute numbers or as fractions (0.0 to -1.0) of the range, depending on the "a" or "f" in the style setting -for the color map. - -Here is how the entries are used to determine the color of an -individual atom, given the value X of its atom attribute. The entries -are scanned from first to last. The first time that *lo* <= X <= -*hi*, X is assigned the color associated with that entry. You can -think of the last entry as assigning a default color (since it will -always be matched by X), and the earlier entries as colors that -override the default. Also note that no interpolation of a color RGB -is done. All atoms will be drawn with one of the colors in the list -of entries. - -For sequential color maps, each entry has only a *color*\ . Here is how -the entries are used to determine the color of an individual atom, -given the value X of its atom attribute. The range is partitioned -into N bins of width *binsize*\ . Thus X will fall in a specific bin -from 1 to N, say the Mth bin. If it falls on a boundary between 2 -bins, it is considered to be in the higher of the 2 bins. Each bin is -assigned a color from the E entries. If E < N, then the colors are -repeated. For example if 2 entries with colors red and green are -specified, then the odd numbered bins will be red and the even bins -green. The color of the atom is the color of its bin. Note that the -sequential color map is really a shorthand way of defining a discrete -color map without having to specify where all the bin boundaries are. 
- -Here is an example of using a sequential color map to color all the -atoms in individual molecules with a different color. See the -examples/pour/in.pour.2d.molecule input script for an example of how -this is used. - -.. code-block:: LAMMPS - - variable colors string & - "red green blue yellow white & - purple pink orange lime gray" - variable mol atom mol%10 - dump 1 all image 250 image.*.jpg v_mol type & - zoom 1.6 adiam 1.5 - dump_modify 1 pad 5 amap 0 10 sa 1 10 ${colors} - -In this case, 10 colors are defined, and molecule IDs are -mapped to one of the colors, even if there are 1000s of molecules. - ----------- - -The *backcolor* sets the background color of the images. The color -name can be any of the 140 pre-defined colors (see below) or a color -name defined by the dump_modify color option. - ----------- - -The *bcolor* keyword can be used with the :doc:`dump image ` -command, with its *bond* keyword, when its color setting is *type*, to -set the color that bonds of each type will be drawn in the image. - -The specified *type* should be an integer from 1 to Nbondtypes = the -number of bond types. A wildcard asterisk can be used in place of or -in conjunction with the *type* argument to specify a range of bond -types. This takes the form "\*" or "\*n" or "n\*" or "m\*n". If N = the -number of bond types, then an asterisk with no numeric values means -all types from 1 to N. A leading asterisk means all types from 1 to n -(inclusive). A trailing asterisk means all types from n to N -(inclusive). A middle asterisk means all types from m to n -(inclusive). - -The specified *color* can be a single color which is any of the 140 -pre-defined colors (see below) or a color name defined by the -dump_modify color option. Or it can be two or more colors separated -by a "/" character, e.g. red/green/blue. In the former case, that -color is assigned to all the specified bond types. 
In the latter -case, the list of colors are assigned in a round-robin fashion to each -of the specified bond types. - ----------- - -The *bdiam* keyword can be used with the :doc:`dump image ` -command, with its *bond* keyword, when its diam setting is *type*, to -set the diameter that bonds of each type will be drawn in the image. -The specified *type* should be an integer from 1 to Nbondtypes. As -with the *bcolor* keyword, a wildcard asterisk can be used as part of -the *type* argument to specify a range of bond types. The specified -*diam* is the size in whatever distance :doc:`units ` you are -using, e.g. Angstroms. - ----------- - -The *bitrate* keyword can be used with the :doc:`dump movie ` command to define the size of the resulting -movie file and its quality via setting how many kbits per second are -to be used for the movie file. Higher bitrates require less -compression and will result in higher quality movies. The quality is -also determined by the compression format and encoder. The default -setting is 2000 kbit/s, which will result in average quality with -older compression formats. - -.. note:: - - Not all movie file formats supported by dump movie allow the - bitrate to be set. If not, the setting is silently ignored. - ----------- - -The *boxcolor* keyword sets the color of the simulation box drawn -around the atoms in each image as well as the color of processor -sub-domain boundaries. See the "dump image box" command for how to -specify that a box be drawn via the *box* keyword, and the sub-domain -boundaries via the *subbox* keyword. The color name can be any of the -140 pre-defined colors (see below) or a color name defined by the -dump_modify color option. - ----------- - -The *color* keyword allows definition of a new color name, in addition -to the 140-predefined colors (see below), and associates 3 -red/green/blue RGB values with that color name. 
The color name can -then be used with any other dump_modify keyword that takes a color -name as a value. The RGB values should each be floating point values -between 0.0 and 1.0 inclusive. - -When a color name is converted to RGB values, the user-defined color -names are searched first, then the 140 pre-defined color names. This -means you can also use the *color* keyword to overwrite one of the -pre-defined color names with new RBG values. - ----------- - -The *framerate* keyword can be used with the :doc:`dump movie ` command to define the duration of the resulting -movie file. Movie files written by the dump *movie* command have a -default frame rate of 24 frames per second and the images generated -will be converted at that rate. Thus a sequence of 1000 dump images -will result in a movie of about 42 seconds. To make a movie run -longer you can either generate images more frequently or lower the -frame rate. To speed a movie up, you can do the inverse. Using a -frame rate higher than 24 is not recommended, as it will result in -simply dropping the rendered images. It is more efficient to dump -images less frequently. - ----------- - -The *header* keyword toggles whether the dump file will include a header. -Excluding a header will reduce the size of the dump file for fixes such as -:doc:`fix pair/tracker ` which do not require the information -typically written to the header. - ----------- - -The COMPRESS package offers both GZ and Zstd compression variants of styles -atom, custom, local, cfg, and xyz. When using these styles the compression -level can be controlled by the :code:`compression_level` parameter. File names -with these styles have to end in either :code:`.gz` or :code:`.zst`. - -GZ supports compression levels from -1 (default), 0 (no compression), and 1 to -9. 9 being the best compression. The COMPRESS :code:`/gz` styles use 9 as -default compression level. +GZ supports compression levels from -1 (default), 0 (no compression), +and 1 to +9. 
9 being the best compression. The COMPRESS :code:`/gz` styles use 9 +as default compression level. Zstd offers a wider range of compression levels, including negative -levels that sacrifice compression for performance. 0 is the -default, positive levels are 1 to 22, with 22 being the most expensive +levels that sacrifice compression for performance. 0 is the default, +positive levels are 1 to 22, with 22 being the most expensive compression. Zstd promises higher compression/decompression speeds for similar compression ratios. For more details see `http://facebook.github.io/zstd/`. -In addition, Zstd compressed files can have a checksum of the entire -contents. The Zstd enabled dump styles enable this feature by default and it -can be disabled with the :code:`checksum` parameter. +In addition, Zstd compressed files can include a checksum of the +entire contents. The Zstd enabled dump styles enable this feature by +default and it can be disabled with the :code:`checksum` keyword. ---------- Restrictions """""""""""" - none + +Not all *dump_modify* options can be applied to all dump styles. +Details are in the discussions of the individual options. Related commands """""""""""""""" @@ -1046,100 +854,7 @@ The option defaults are * units = no * unwrap = no -* acolor = \* red/green/blue/yellow/aqua/cyan -* adiam = \* 1.0 -* amap = min max cf 0.0 2 min blue max red -* backcolor = black -* bcolor = \* red/green/blue/yellow/aqua/cyan -* bdiam = \* 0.5 -* bitrate = 2000 -* boxcolor = yellow -* color = 140 color names are pre-defined as listed below -* framerate = 24 - * compression_level = 9 (gz variants) * compression_level = 0 (zstd variants) * checksum = yes (zstd variants) ----------- - -These are the standard 109 element names that LAMMPS pre-defines for -use with the :doc:`dump image ` and dump_modify commands. 
- -* 1-10 = "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne" -* 11-20 = "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca" -* 21-30 = "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn" -* 31-40 = "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr" -* 41-50 = "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn" -* 51-60 = "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd" -* 61-70 = "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb" -* 71-80 = "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg" -* 81-90 = "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th" -* 91-100 = "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm" -* 101-109 = "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt" - ----------- - -These are the 140 colors that LAMMPS pre-defines for use with the -:doc:`dump image ` and dump_modify commands. Additional -colors can be defined with the dump_modify color command. The 3 -numbers listed for each name are the RGB (red/green/blue) values. -Divide each value by 255 to get the equivalent 0.0 to 1.0 value. 
- -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| aliceblue = 240, 248, 255 | antiquewhite = 250, 235, 215 | aqua = 0, 255, 255 | aquamarine = 127, 255, 212 | azure = 240, 255, 255 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| beige = 245, 245, 220 | bisque = 255, 228, 196 | black = 0, 0, 0 | blanchedalmond = 255, 255, 205 | blue = 0, 0, 255 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| blueviolet = 138, 43, 226 | brown = 165, 42, 42 | burlywood = 222, 184, 135 | cadetblue = 95, 158, 160 | chartreuse = 127, 255, 0 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| chocolate = 210, 105, 30 | coral = 255, 127, 80 | cornflowerblue = 100, 149, 237 | cornsilk = 255, 248, 220 | crimson = 220, 20, 60 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| cyan = 0, 255, 255 | darkblue = 0, 0, 139 | darkcyan = 0, 139, 139 | darkgoldenrod = 184, 134, 11 | darkgray = 169, 169, 169 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| darkgreen = 0, 100, 0 | darkkhaki = 189, 183, 107 | darkmagenta = 139, 0, 139 | darkolivegreen = 85, 107, 47 | darkorange = 255, 140, 0 | 
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| darkorchid = 153, 50, 204 | darkred = 139, 0, 0 | darksalmon = 233, 150, 122 | darkseagreen = 143, 188, 143 | darkslateblue = 72, 61, 139 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| darkslategray = 47, 79, 79 | darkturquoise = 0, 206, 209 | darkviolet = 148, 0, 211 | deeppink = 255, 20, 147 | deepskyblue = 0, 191, 255 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| dimgray = 105, 105, 105 | dodgerblue = 30, 144, 255 | firebrick = 178, 34, 34 | floralwhite = 255, 250, 240 | forestgreen = 34, 139, 34 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| fuchsia = 255, 0, 255 | gainsboro = 220, 220, 220 | ghostwhite = 248, 248, 255 | gold = 255, 215, 0 | goldenrod = 218, 165, 32 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| gray = 128, 128, 128 | green = 0, 128, 0 | greenyellow = 173, 255, 47 | honeydew = 240, 255, 240 | hotpink = 255, 105, 180 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| indianred = 205, 92, 92 | indigo = 75, 0, 130 | ivory = 255, 240, 240 | khaki = 240, 230, 140 | lavender = 230, 230, 250 | 
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| lavenderblush = 255, 240, 245 | lawngreen = 124, 252, 0 | lemonchiffon = 255, 250, 205 | lightblue = 173, 216, 230 | lightcoral = 240, 128, 128 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| lightcyan = 224, 255, 255 | lightgoldenrodyellow = 250, 250, 210 | lightgreen = 144, 238, 144 | lightgrey = 211, 211, 211 | lightpink = 255, 182, 193 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| lightsalmon = 255, 160, 122 | lightseagreen = 32, 178, 170 | lightskyblue = 135, 206, 250 | lightslategray = 119, 136, 153 | lightsteelblue = 176, 196, 222 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| lightyellow = 255, 255, 224 | lime = 0, 255, 0 | limegreen = 50, 205, 50 | linen = 250, 240, 230 | magenta = 255, 0, 255 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| maroon = 128, 0, 0 | mediumaquamarine = 102, 205, 170 | mediumblue = 0, 0, 205 | mediumorchid = 186, 85, 211 | mediumpurple = 147, 112, 219 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| mediumseagreen = 60, 179, 113 | mediumslateblue = 123, 104, 238 | mediumspringgreen = 0, 250, 154 | mediumturquoise = 72, 209, 204 | mediumvioletred = 199, 21, 133 | 
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| midnightblue = 25, 25, 112 | mintcream = 245, 255, 250 | mistyrose = 255, 228, 225 | moccasin = 255, 228, 181 | navajowhite = 255, 222, 173 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| navy = 0, 0, 128 | oldlace = 253, 245, 230 | olive = 128, 128, 0 | olivedrab = 107, 142, 35 | orange = 255, 165, 0 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| orangered = 255, 69, 0 | orchid = 218, 112, 214 | palegoldenrod = 238, 232, 170 | palegreen = 152, 251, 152 | paleturquoise = 175, 238, 238 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| palevioletred = 219, 112, 147 | papayawhip = 255, 239, 213 | peachpuff = 255, 239, 213 | peru = 205, 133, 63 | pink = 255, 192, 203 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| plum = 221, 160, 221 | powderblue = 176, 224, 230 | purple = 128, 0, 128 | red = 255, 0, 0 | rosybrown = 188, 143, 143 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| royalblue = 65, 105, 225 | saddlebrown = 139, 69, 19 | salmon = 250, 128, 114 | sandybrown = 244, 164, 96 | seagreen = 46, 139, 87 | 
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| seashell = 255, 245, 238 | sienna = 160, 82, 45 | silver = 192, 192, 192 | skyblue = 135, 206, 235 | slateblue = 106, 90, 205 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| slategray = 112, 128, 144 | snow = 255, 250, 250 | springgreen = 0, 255, 127 | steelblue = 70, 130, 180 | tan = 210, 180, 140 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| teal = 0, 128, 128 | thistle = 216, 191, 216 | tomato = 253, 99, 71 | turquoise = 64, 224, 208 | violet = 238, 130, 238 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ -| wheat = 245, 222, 179 | white = 255, 255, 255 | whitesmoke = 245, 245, 245 | yellow = 255, 255, 0 | yellowgreen = 154, 205, 50 | -+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+ diff --git a/doc/src/fix_addtorque.rst b/doc/src/fix_addtorque.rst index 4e1ca12228..73af4ae571 100644 --- a/doc/src/fix_addtorque.rst +++ b/doc/src/fix_addtorque.rst @@ -99,7 +99,7 @@ invoked by the :doc:`minimize ` command. Restrictions """""""""""" -This fix is part of the MISC package. It is only enabled if +This fix is part of the EXTRA-FIX package. It is only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. 
diff --git a/doc/src/fix_dt_reset.rst b/doc/src/fix_dt_reset.rst index c3aa431e18..368a3dcd70 100644 --- a/doc/src/fix_dt_reset.rst +++ b/doc/src/fix_dt_reset.rst @@ -78,13 +78,20 @@ outer loop (largest) timestep, which is the same timestep that the Note that the cumulative simulation time (in time units), which accounts for changes in the timestep size as a simulation proceeds, -can be accessed by the :doc:`thermo_style time ` keyword. +can be accessed by the :doc:`thermo_style time ` +keyword. + +Also note that the :doc:`dump_modify every/time ` option +allows dump files to be written at intervals specified by simulation +time, rather than by timesteps. Simulation time is in time units; +see the :doc:`units ` doc page for details. Restart, fix_modify, output, run start/stop, minimize info """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" -No information about this fix is written to :doc:`binary restart files `. None of the :doc:`fix_modify ` options -are relevant to this fix. +No information about this fix is written to :doc:`binary restart files +`. None of the :doc:`fix_modify ` options are +relevant to this fix. This fix computes a global scalar which can be accessed by various :doc:`output commands `. The scalar stores the last @@ -93,7 +100,8 @@ timestep on which the timestep was reset to a new value. The scalar value calculated by this fix is "intensive". No parameter of this fix can be used with the *start/stop* keywords of -the :doc:`run ` command. This fix is not invoked during :doc:`energy minimization `. +the :doc:`run ` command. This fix is not invoked during +:doc:`energy minimization `. 
Restrictions """""""""""" @@ -102,7 +110,7 @@ Restrictions Related commands """""""""""""""" -:doc:`timestep ` +:doc:`timestep `, :doc:`dump_modify every/time ` Default """"""" diff --git a/doc/src/fix_langevin_drude.rst b/doc/src/fix_langevin_drude.rst index 89ea28cf08..5e62e4f416 100644 --- a/doc/src/fix_langevin_drude.rst +++ b/doc/src/fix_langevin_drude.rst @@ -40,7 +40,7 @@ Example input scripts available: examples/PACKAGES/drude Description """"""""""" -Apply two Langevin thermostats as described in :ref:`(Jiang) ` for +Apply two Langevin thermostats as described in :ref:`(Jiang1) ` for thermalizing the reduced degrees of freedom of Drude oscillators. This link describes how to use the :doc:`thermalized Drude oscillator model ` in LAMMPS and polarizable models in LAMMPS are discussed on the :doc:`Howto polarizable ` doc @@ -300,5 +300,5 @@ The option defaults are zero = no. .. _Jiang1: -**(Jiang)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux, J +**(Jiang1)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux, J Phys Chem Lett, 2, 87-92 (2011). diff --git a/doc/src/fix_oneway.rst b/doc/src/fix_oneway.rst index f54cc42ed0..4c5afb29cf 100644 --- a/doc/src/fix_oneway.rst +++ b/doc/src/fix_oneway.rst @@ -51,7 +51,7 @@ the :doc:`run ` command. This fix is not invoked during :doc:`energy minim Restrictions """""""""""" -This fix is part of the MISC package. It is only enabled if LAMMPS +This fix is part of the EXTRA-FIX package. It is only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. Related commands diff --git a/doc/src/fix_smd.rst b/doc/src/fix_smd.rst index 93554a4510..4c682e66c0 100644 --- a/doc/src/fix_smd.rst +++ b/doc/src/fix_smd.rst @@ -144,7 +144,7 @@ the :doc:`run ` command. This fix is not invoked during Restrictions """""""""""" -This fix is part of the MISC package. It is only enabled if +This fix is part of the EXTRA-FIX package. It is only enabled if LAMMPS was built with that package. 
See the :doc:`Build package ` page for more info. Related commands diff --git a/doc/src/kspace_style.rst b/doc/src/kspace_style.rst index 1dec62bb43..b6287650b9 100644 --- a/doc/src/kspace_style.rst +++ b/doc/src/kspace_style.rst @@ -310,7 +310,7 @@ Forschungszentrum Juelich. The library is available for download at "http://scafacos.de" or can be cloned from the git-repository -"git://github.com/scafacos/scafacos.git". +"https://github.com/scafacos/scafacos.git". In order to use this KSpace style, you must download and build the ScaFaCoS library, then build LAMMPS with the SCAFACOS package diff --git a/doc/src/pair_granular.rst b/doc/src/pair_granular.rst index b7f9da9f8b..6f84b0d9c7 100644 --- a/doc/src/pair_granular.rst +++ b/doc/src/pair_granular.rst @@ -205,7 +205,7 @@ For *damping mass_velocity*, the normal damping is given by: \eta_n = \eta_{n0} m_{eff} Here, :math:`\eta_{n0}` is the damping coefficient specified for the normal -contact model, in units of *mass*\ /\ *time* and +contact model, in units of 1/\ *time* and :math:`m_{eff} = m_i m_j/(m_i + m_j)` is the effective mass. Use *damping mass_velocity* to reproduce the damping behavior of *pair gran/hooke/\**. diff --git a/doc/src/pair_hybrid.rst b/doc/src/pair_hybrid.rst index 541cdc1911..1460927add 100644 --- a/doc/src/pair_hybrid.rst +++ b/doc/src/pair_hybrid.rst @@ -74,14 +74,17 @@ atoms interact with each other via an *eam* potential, the surface atoms interact with each other via a *lj/cut* potential, and the metal/surface interaction is also computed via a *lj/cut* potential. The *hybrid/overlay* style could be used as in the second example above, -where multiple potentials are superposed in an additive fashion to +where multiple potentials are superimposed in an additive fashion to compute the interaction between atoms. In this example, using *lj/cut* and *coul/long* together gives the same result as if the *lj/cut/coul/long* potential were used by itself. 
In this case, it would be more efficient to use the single combined potential, but in general any combination of pair potentials can be used together in to produce an interaction that is not encoded in any single pair_style -file, e.g. adding Coulombic forces between granular particles. +file, e.g. adding Coulombic forces between granular particles. Another +limitation of using the *hybrid/overlay* variant is that it does not generate +*lj/cut* parameters for mixed atom types from a mixing rule due to +restrictions discussed below. If the *hybrid/scaled* style is used instead of *hybrid/overlay*, contributions from sub-styles are weighted by their scale factors, which @@ -150,10 +153,14 @@ with Tersoff, and the cross-interactions with Lennard-Jones: pair_coeff * * tersoff 2 C.tersoff NULL C pair_coeff 1 2 lj/cut 1.0 1.5 -If pair coefficients are specified in the data file read via the -:doc:`read_data ` command, then the same rule applies. -E.g. "eam/alloy" or "lj/cut" must be added after the atom type, for -each line in the "Pair Coeffs" section, e.g. + +It is not recommended to read pair coefficients for a hybrid style from a "Pair Coeffs" +or "PairIJ Coeffs" section of a data file via the :doc:`read_data ` command, +since those sections expect a fixed number of lines, either one line per atom type or +one line per pair of atom types, respectively. When reading from a data file, the +lines of the "Pair Coeffs" and "PairIJ Coeffs" sections are changed in the same way as the *pair_coeff* +command, i.e. the name of the pair style to which the parameters apply must follow the +atom type (or atom types), e.g. .. parsed-literal:: @@ -162,6 +169,11 @@ each line in the "Pair Coeffs" section, e.g. 1 lj/cut/coul/cut 1.0 1.0 ... + PairIJ Coeffs + + 1 1 lj/cut/coul/cut 1.0 1.0 + ...
+ Note that the pair_coeff command for some potentials such as :doc:`pair_style eam/alloy ` includes a mapping specification of elements to all atom types, which in the hybrid case, can include @@ -208,12 +220,22 @@ examples above, or in the data file read by the :doc:`read_data `, or by mixing as described below. Also all sub-styles must be used at least once in a :doc:`pair_coeff ` command. -.. note:: +.. warning:: - LAMMPS never performs mixing of parameters from different sub-styles, - **even** if they use the same type of coefficients, e.g. contain - a Lennard-Jones potential variant. Those parameters must be provided - explicitly. + With hybrid pair styles the use of mixing to generate pair + coefficients is significantly limited compared to the individual pair + styles. LAMMPS **never** performs mixing of parameters from + different sub-styles, **even** if they use the same type of + coefficients, e.g. contain a Lennard-Jones potential variant. Those + parameters must be provided explicitly. Also for *hybrid/overlay* + and *hybrid/scaled* mixing is **only** performed for pairs of atom + types for which only a single pair style is assigned. + + Thus it is strongly recommended to provide all mixed terms + explicitly. For non-hybrid styles those could be generated and + written out using the :doc:`write_coeff command ` and + then edited as needed to comply with the requirements for hybrid + styles as explained above. If you want there to be no interactions between a particular pair of atom types, you have 3 choices. You can assign the pair of atom types diff --git a/doc/src/pair_lebedeva_z.rst b/doc/src/pair_lebedeva_z.rst index 5afd0da92c..80fe1c52cb 100644 --- a/doc/src/pair_lebedeva_z.rst +++ b/doc/src/pair_lebedeva_z.rst @@ -26,15 +26,29 @@ Examples Description """"""""""" -The *lebedeva/z* style computes the Lebedeva interaction -potential as described in :ref:`(Lebedeva et al.) `. 
An important simplification is made, -which is to take all normals along the z-axis. +The *lebedeva/z* pair style computes the Lebedeva interaction potential +as described in :ref:`(Lebedeva1) ` and :ref:`(Lebedeva2) +`. An important simplification is made, which is to take all +normals along the z-axis. + +The Lebedeva potential is intended for the description of the interlayer +interaction between graphene layers. To perform a realistic simulation, +this potential must be used in combination with an intralayer potential +such as :doc:`AIREBO ` or :doc:`Tersoff ` +facilitated by using pair style :doc:`hybrid/overlay `. To +keep the intralayer properties unaffected, the interlayer interaction +within the same layers should be avoided. This can be achieved by +assigning different atom types to atoms of different layers (e.g. 1 and +2 in the examples above). + +Other interactions can be set to zero using pair_style *none*\ . + .. math:: - E = & \frac{1}{2} \sum_i \sum_{i \neq j} V_{ij}\\ + E = & \frac{1}{2} \sum_i \sum_{j \neq i} V_{ij}\\ V_{ij} = & B e^{-\alpha(r_{ij} - z_0)} \\ - & + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij} e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\ + & + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij}) e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\ & - A \left(\frac{z_0}{r_ij}\right)^6 + A \left( \frac{z_0}{r_c} \right)^6 \\ \rho^2_{ij} = & x^2_{ij} + y^2_{ij} \qquad (\mathbf{n_i} \equiv \mathbf{\hat{z}}) @@ -43,12 +57,15 @@ Energies are shifted so that they go continuously to zero at the cutoff assuming that the exponential part of :math:`V_{ij}` (first term) decays sufficiently fast. This shift is achieved by the last term in the equation for :math:`V_{ij}` above. -The parameter file (e.g. CC.Lebedeva), is intended for use with metal -:doc:`units `, with energies in meV. An additional parameter, *S*, -is available to facilitate scaling of energies. +The provided parameter file (CC.Lebedeva) contains two sets of parameters. 
-This potential must be used in combination with hybrid/overlay. -Other interactions can be set to zero using pair_style *none*\ . +- The first set (element name "C") is suitable for normal conditions and + is taken from :ref:`(Popov1) ` +- The second set (element name "C1") is suitable for high-pressure + conditions and is taken from :ref:`(Koziol1) ` + +Both sets contain an additional parameter, *S*, that can be used to +facilitate scaling of energies and is set to 1.0 by default. Restrictions """""""""""" @@ -77,4 +94,16 @@ none .. _Leb01: -**(Lebedeva et al.)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011) +**(Lebedeva1)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011) + +.. _Leb02: + +**(Lebedeva2)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Physica E: 44, 949-954 (2012) + +.. _Popov: + +**(Popov1)** A.M. Popov, I. V. Lebedeva, A. A. Knizhnik, Y. E. Lozovik and B. V. Potapkin, Chem. Phys. Lett. 536, 82-86 (2012). + +.. _Koziol: + +**(Koziol1)** Z. Koziol, G. Gawlik and J. Jagielski, Chinese Phys. B 28, 096101 (2019). diff --git a/doc/src/pair_local_density.rst b/doc/src/pair_local_density.rst index f7e26389c3..2925ef2811 100644 --- a/doc/src/pair_local_density.rst +++ b/doc/src/pair_local_density.rst @@ -26,23 +26,25 @@ Examples Description """"""""""" -The local density (LD) potential is a mean-field manybody potential, and, in some -sense,a generalization of embedded atom models (EAM). The name "local density -potential" arises from the fact that it assigns an energy to an atom depending -on the number of neighboring atoms of given type around it within a predefined -spherical volume (i.e., within a cutoff). 
The bottom-up coarse-graining (CG) -literature suggests that such potentials can be widely useful in capturing -effective multibody forces in a computationally efficient manner so as to -improve the quality of CG models of implicit solvation:ref:`(Sanyal1) ` and -phase-segregation in liquid mixtures:ref:`(Sanyal2) `, and provide guidelines -to determine the extent of manybody correlations present in a CG -model.:ref:`(Rosenberger) ` The LD potential in LAMMPS is primarily -intended to be used as a corrective potential over traditional pair potentials -in bottom-up CG models, i.e., as a hybrid pair style with -other explicit pair interaction terms (e.g., table spline, Lennard Jones, etc.). -Because the LD potential is not a pair potential per se, it is implemented -simply as a single auxiliary file with all specifications that will be read -upon initialization. +The local density (LD) potential is a mean-field manybody potential, +and, in some way, a generalization of embedded atom models (EAM). The +name "local density potential" arises from the fact that it assigns an +energy to an atom depending on the number of neighboring atoms of a +given type around it within a predefined spherical volume (i.e., within +the cutoff). The bottom-up coarse-graining (CG) literature suggests +that such potentials can be widely useful in capturing effective +multibody forces in a computationally efficient manner and thus improve +the quality of CG models of implicit solvation :ref:`(Sanyal1) +` and phase-segregation in liquid mixtures :ref:`(Sanyal2) +`, and provide guidelines to determine the extent of manybody +correlations present in a CG model :ref:`(Rosenberger) `. +The LD potential in LAMMPS is primarily intended to be used as a +corrective potential over traditional pair potentials in bottom-up CG +models via :doc:`hybrid/overlay pair style ` with other +explicit pair interaction terms (e.g., tabulated, Lennard-Jones, Morse +etc.). 
Because the LD potential is not a pair potential per se, it is +implemented simply as a single auxiliary file with all specifications +that will be read upon initialization. .. note:: diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst index d091496325..022de60f98 100644 --- a/doc/src/pair_meam.rst +++ b/doc/src/pair_meam.rst @@ -28,16 +28,16 @@ Description as of November 2010; see description below of the mixture_ref_t parameter -Style *meam* computes pairwise interactions for a variety of materials -using modified embedded-atom method (MEAM) potentials +Pair style *meam* computes non-bonded interactions for a variety of materials +using the modified embedded-atom method (MEAM) :ref:`(Baskes) `. Conceptually, it is an extension to the original -:doc:`EAM potentials ` which adds angular forces. It is +:doc:`EAM method ` which adds angular forces. It is thus suitable for modeling metals and alloys with fcc, bcc, hcp and -diamond cubic structures, as well as covalently bonded materials like -silicon and carbon. Style *meam* is a translation of the (now obsolete) -*meam* code from Fortran to C++. It is functionally equivalent to *meam* -but more efficient, and thus *meam* has been removed from LAMMPS after -the 12 December 2018 release. +diamond cubic structures, as well as materials with covalent interactions +like silicon and carbon. This *meam* pair style is a translation of the +original Fortran version to C++. It is functionally equivalent but more +efficient and has additional features. The Fortran version of the *meam* +pair style has been removed from LAMMPS after the 12 December 2018 release. 
In the MEAM formulation, the total energy E of a system of atoms is given by: diff --git a/doc/src/pair_modify.rst b/doc/src/pair_modify.rst index 1a62a4c1a0..4941693fbd 100644 --- a/doc/src/pair_modify.rst +++ b/doc/src/pair_modify.rst @@ -71,21 +71,23 @@ The *mix* keyword affects pair coefficients for interactions between atoms of type I and J, when I != J and the coefficients are not explicitly set in the input script. Note that coefficients for I = J must be set explicitly, either in the input script via the -:doc:`pair_coeff ` command or in the "Pair Coeffs" section of the -:doc:`data file `. For some pair styles it is not +:doc:`pair_coeff ` command or in the "Pair Coeffs" or "PairIJ Coeffs" +sections of the :doc:`data file `. For some pair styles it is not necessary to specify coefficients when I != J, since a "mixing" rule will create them from the I,I and J,J settings. The pair_modify *mix* value determines what formulas are used to compute the mixed coefficients. In each case, the cutoff distance is mixed the same way as sigma. -Note that not all pair styles support mixing and some mix options -are not available for certain pair styles. Also, there are additional -restrictions when using :doc:`pair style hybrid or hybrid/overlay `. -See the page for individual pair styles for those restrictions. Note also that the -:doc:`pair_coeff ` command also can be used to directly set -coefficients for a specific I != J pairing, in which case no mixing is -performed. +Note that not all pair styles support mixing and some mix options are +not available for certain pair styles. Also, there are additional +restrictions when using :doc:`pair style hybrid or hybrid/overlay +`. See the page for individual pair styles for those +restrictions. Note also that the :doc:`pair_coeff ` command +also can be used to directly set coefficients for a specific I != J +pairing, in which case no mixing is performed. 
If possible, LAMMPS will +print an informational message about how many of the mixed pair +coefficients were generated and which mixing rule was applied. - mix *geometric* diff --git a/doc/src/pair_nm.rst b/doc/src/pair_nm.rst index 2c356bb4ca..2256c7f220 100644 --- a/doc/src/pair_nm.rst +++ b/doc/src/pair_nm.rst @@ -1,4 +1,5 @@ .. index:: pair_style nm/cut +.. index:: pair_style nm/cut/split .. index:: pair_style nm/cut/coul/cut .. index:: pair_style nm/cut/coul/long .. index:: pair_style nm/cut/omp @@ -10,6 +11,9 @@ pair_style nm/cut command Accelerator Variants: *nm/cut/omp* +pair_style nm/cut/split command +=============================== + pair_style nm/cut/coul/cut command ================================== @@ -27,13 +31,15 @@ Syntax pair_style style args -* style = *nm/cut* or *nm/cut/coul/cut* or *nm/cut/coul/long* +* style = *nm/cut* or *nm/cut/split* or *nm/cut/coul/cut* or *nm/cut/coul/long* * args = list of arguments for a particular style .. parsed-literal:: *nm/cut* args = cutoff cutoff = global cutoff for Pair interactions (distance units) + *nm/cut/split* args = cutoff + cutoff = global cutoff for Pair interactions (distance units) *nm/cut/coul/cut* args = cutoff (cutoff2) cutoff = global cutoff for Pair (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) @@ -50,6 +56,10 @@ Examples pair_coeff * * 0.01 5.4 8.0 7.0 pair_coeff 1 1 0.01 4.4 7.0 6.0 + pair_style nm/cut/split 1.12246 + pair_coeff 1 1 1.0 1.1246 12 6 + pair_coeff * * 1.0 1.1246 11 6 + pair_style nm/cut/coul/cut 12.0 15.0 pair_coeff * * 0.01 5.4 8.0 7.0 pair_coeff 1 1 0.01 4.4 7.0 6.0 @@ -71,7 +81,15 @@ interaction has the following form: E = \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n \left(\frac{r_0}{r}\right)^m \right] \qquad r < r_c -where :math:`r_c` is the cutoff. +where :math:`r_c` is the cutoff and :math:`r_0` is the minimum of the +potential. 
Please note that this differs from the convention used for +other Lennard-Jones potentials in LAMMPS where :math:`\sigma` represents +the location where the energy is zero. + +Style *nm/cut/split* applies the standard LJ (12-6) potential above +:math:`r_0 = 2^\frac{1}{6}\sigma`. Style *nm/cut/split* is employed in +polymer equilibration protocols that combine core-softening approaches +with topology-changing moves :ref:`Dietz `. Style *nm/cut/coul/cut* adds a Coulombic pairwise interaction given by @@ -155,7 +173,6 @@ the :doc:`run_style respa ` command. They do not support the Restrictions """""""""""" - These pair styles are part of the EXTRA-PAIR package. They are only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. @@ -163,7 +180,7 @@ LAMMPS was built with that package. See the Related commands """""""""""""""" -:doc:`pair_coeff ` +:doc:`pair_coeff `, :doc:`pair style lj/cut `, :doc:`bond style fene/nm ` Default """"""" @@ -175,3 +192,7 @@ none .. _Clarke: **(Clarke)** Clarke and Smith, J Chem Phys, 84, 2290 (1986). + +.. _Dietz: + +**(Dietz)** Dietz and Hoy, J. Chem Phys, 156, 014103 (2022). diff --git a/doc/src/pair_python.rst b/doc/src/pair_python.rst index 3d087565be..35e07dbd11 100644 --- a/doc/src/pair_python.rst +++ b/doc/src/pair_python.rst @@ -126,11 +126,11 @@ and *compute_energy*, which both take 3 numerical arguments: * itype = the (numerical) type of the first atom * jtype = the (numerical) type of the second atom -This functions need to compute the force and the energy, respectively, -and use the result as return value. The functions need to use the -*pmap* dictionary to convert the LAMMPS atom type number to the symbolic -value of the internal potential parameter data structure. Following -the *LJCutMelt* example, here are the two functions: +These functions need to compute the (scaled) force and the energy, +respectively, and use the result as return value.
The functions need +to use the *pmap* dictionary to convert the LAMMPS atom type number +to the symbolic value of the internal potential parameter data structure. +Following the *LJCutMelt* example, here are the two functions: .. code-block:: python @@ -154,10 +154,10 @@ the *LJCutMelt* example, here are the two functions: for consistency with the C++ pair styles in LAMMPS, the *compute_force* function follows the conventions of the Pair::single() - methods and does not return the full force, but the force scaled by - the distance between the two atoms, so this value only needs to be - multiplied by delta x, delta y, and delta z to conveniently obtain the - three components of the force vector between these two atoms. + methods and does not return the pairwise force directly, but the force + divided by the distance between the two atoms, so this value only needs + to be multiplied by delta x, delta y, and delta z to conveniently obtain + the three components of the force vector between these two atoms. ---------- diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst index 1cf033ddba..4bb3c90a8d 100644 --- a/doc/src/pair_style.rst +++ b/doc/src/pair_style.rst @@ -274,6 +274,7 @@ accelerated styles exist. * :doc:`nm/cut ` - N-M potential * :doc:`nm/cut/coul/cut ` - N-M potential with cutoff Coulomb * :doc:`nm/cut/coul/long ` - N-M potential with long-range Coulomb +* :doc:`nm/cut/split ` - Split 12-6 Lennard-Jones and N-M potential * :doc:`oxdna/coaxstk ` - * :doc:`oxdna/excv ` - * :doc:`oxdna/hbond ` - @@ -327,6 +328,7 @@ accelerated styles exist. 
* :doc:`spin/neel ` - * :doc:`srp ` - * :doc:`sw ` - Stillinger-Weber 3-body potential +* :doc:`sw/mod ` - modified Stillinger-Weber 3-body potential * :doc:`table ` - tabulated pair potential * :doc:`table/rx ` - * :doc:`tdpd ` - tDPD particle interactions diff --git a/doc/src/pair_sw.rst b/doc/src/pair_sw.rst index 1b2a4a4b1d..d71999b2d4 100644 --- a/doc/src/pair_sw.rst +++ b/doc/src/pair_sw.rst @@ -3,18 +3,34 @@ .. index:: pair_style sw/intel .. index:: pair_style sw/kk .. index:: pair_style sw/omp +.. index:: pair_style sw/mod +.. index:: pair_style sw/mod/omp pair_style sw command ===================== Accelerator Variants: *sw/gpu*, *sw/intel*, *sw/kk*, *sw/omp* +pair_style sw/mod command +========================= + +Accelerator Variants: *sw/mod/omp* + Syntax """""" .. code-block:: LAMMPS - pair_style sw + pair_style style keyword values + +* style = *sw* or *sw/mod* +* keyword = *maxdelcs* + + .. parsed-literal:: + + *maxdelcs* value = delta1 delta2 (optional) + delta1 = The minimum threshold for cosine of three-body angle + delta2 = The maximum threshold for cosine of three-body angle Examples """""""" @@ -25,6 +41,9 @@ Examples pair_coeff * * si.sw Si pair_coeff * * GaN.sw Ga N Ga + pair_style sw/mod maxdelcs 0.25 0.35 + pair_coeff * * tmd.sw.mod Mo S S + Description """"""""""" @@ -48,8 +67,52 @@ where :math:`\phi_2` is a two-body term and :math:`\phi_3` is a three-body term. The summations in the formula are over all neighbors J and K of atom I within a cutoff distance :math:`a `\sigma`. -Only a single pair_coeff command is used with the *sw* style which -specifies a Stillinger-Weber potential file with parameters for all +The *sw/mod* style is designed for simulations of materials when +distinguishing three-body angles is necessary, such as borophene +and transition metal dichalcogenides, which cannot be described +by the original code for the Stillinger-Weber potential.
+For instance, there are several types of angles around each Mo atom in :math:`MoS_2`, +and some unnecessary angle types should be excluded in the three-body interaction. +Such exclusion may be realized by selecting proper angle types directly. +The exclusion of unnecessary angles is achieved here by the cut-off function (:math:`f_C(\delta)`), +which induces only minimum modifications for LAMMPS. + +Validation, benchmark tests, and applications of the *sw/mod* style +can be found in :ref:`(Jiang2) ` and :ref:`(Jiang3) `. + +The *sw/mod* style computes the energy E of a system of atoms, whose potential +function is mostly the same as the Stillinger-Weber potential. The only modification +is in the three-body term, where the value of :math:`\delta = \cos \theta_{ijk} - \cos \theta_{0ijk}` +used in the original energy and force expression is scaled by a switching factor :math:`f_C(\delta)`: + +.. math:: + + f_C(\delta) & = \left\{ \begin{array} {r@{\quad:\quad}l} + 1 & \left| \delta \right| < \delta_1 \\ + \frac{1}{2} + \frac{1}{2} \cos \left( \pi \frac{\left| \delta \right| - \delta_1}{\delta_2 - \delta_1} \right) & + \delta_1 < \left| \delta \right| < \delta_2 \\ + 0 & \left| \delta \right| > \delta_2 + \end{array} \right. \\ + +This cut-off function decreases smoothly from 1 to 0 over the range :math:`[\delta_1, \delta_2]`. +This smoothly turns off the energy and force contributions for :math:`\left| \delta \right| > \delta_2`. +It is suggested that :math:`\delta_1` and :math:`\delta_2` be values around +:math:`0.5 \left| \cos \theta_1 - \cos \theta_2 \right|`, with +:math:`\theta_1` and :math:`\theta_2` as the different types of angles around an atom. +For borophene and transition metal dichalcogenides, :math:`\delta_1 = 0.25` and :math:`\delta_2 = 0.35`. +These values enable the cut-off function to exclude unnecessary angles in the three-body SW terms. + +..
note:: + + The cut-off function is just to be used as a technique to exclude some unnecessary angles, + and it has no physical meaning. It should be noted that the force and potential are inconsistent + with each other in the decaying range of the cut-off function, as the angle dependence for the + cut-off function is not implemented in the force (first derivative of the potential). + However, the angle variation is much smaller than the given threshold value for actual simulations, + so the inconsistency between potential and force can be neglected in actual simulations. + +Only a single pair_coeff command is used with the *sw* and *sw/mod* styles +which specifies a Stillinger-Weber potential file with parameters for all needed elements. These are mapped to LAMMPS atom types by specifying N additional arguments after the filename in the pair_coeff command, where N is the number of LAMMPS atom types: @@ -213,10 +276,19 @@ Related commands Default """"""" -none +The default values for the *maxdelcs* setting of the *sw/mod* pair +style are *delta1* = 0.25 and *delta2* = 0.35. ---------- .. _Stillinger2: **(Stillinger)** Stillinger and Weber, Phys Rev B, 31, 5262 (1985). + +.. _Jiang2: + +**(Jiang2)** J.-W. Jiang, Nanotechnology 26, 315706 (2015). + +.. _Jiang3: + +**(Jiang3)** J.-W. Jiang, Acta Mech. Solida. Sin 32, 17 (2019). diff --git a/doc/src/pair_tersoff.rst b/doc/src/pair_tersoff.rst index ab88806ca6..38a0262f5d 100644 --- a/doc/src/pair_tersoff.rst +++ b/doc/src/pair_tersoff.rst @@ -23,7 +23,7 @@ Syntax pair_style style keywords values -* style = *tersoff* or *tersoff/table* or *tersoff/gpu* or *tersoff/omp* or *tersoff/table/omp* +* style = *tersoff* or *tersoff/table* * keyword = *shift* ..
parsed-literal:: diff --git a/doc/src/pair_thole.rst b/doc/src/pair_thole.rst index 5a1e72f569..a4e8bbb96e 100644 --- a/doc/src/pair_thole.rst +++ b/doc/src/pair_thole.rst @@ -17,7 +17,7 @@ Syntax pair_style style args -* style = *thole* or *lj/cut/thole/long* or *lj/cut/thole/long/omp* +* style = *thole* or *lj/cut/thole/long* * args = list of arguments for a particular style .. parsed-literal:: @@ -25,7 +25,7 @@ Syntax *thole* args = damp cutoff damp = global damping parameter cutoff = global cutoff (distance units) - *lj/cut/thole/long* or *lj/cut/thole/long/omp* args = damp cutoff (cutoff2) + *lj/cut/thole/long* args = damp cutoff (cutoff2) damp = global damping parameter cutoff = global cutoff for LJ (and Thole if only 1 arg) (distance units) cutoff2 = global cutoff for Thole (optional) (distance units) diff --git a/doc/src/pair_vashishta.rst b/doc/src/pair_vashishta.rst index d38ac02a96..8310eb7607 100644 --- a/doc/src/pair_vashishta.rst +++ b/doc/src/pair_vashishta.rst @@ -22,13 +22,13 @@ Syntax pair_style style args -* style = *vashishta* or *vashishta/table* or *vashishta/omp* or *vashishta/table/omp* +* style = *vashishta* or *vashishta/table* * args = list of arguments for a particular style .. parsed-literal:: - *vashishta* or *vashishta/omp* args = none - *vashishta/table* or *vashishta/table/omp* args = Ntable cutinner + *vashishta* args = none + *vashishta/table* args = Ntable cutinner Ntable = # of tabulation points cutinner = tablulate from cutinner to cutoff diff --git a/doc/src/read_dump.rst b/doc/src/read_dump.rst index c873156a38..3a771b9c2d 100644 --- a/doc/src/read_dump.rst +++ b/doc/src/read_dump.rst @@ -98,8 +98,7 @@ command, after the dump snapshot is read. ---------- If the dump filename specified as *file* ends with ".gz", the dump -file is read in gzipped format. You cannot (yet) read a dump file -that was written in binary format with a ".bin" suffix. +file is read in gzipped format. 
You can read dump files that were written (in parallel) to multiple files via the "%" wild-card character in the dump file name. If any @@ -115,8 +114,8 @@ to tell LAMMPS how many parallel files exist, via its specified The format of the dump file is selected through the *format* keyword. If specified, it must be the last keyword used, since all remaining arguments are passed on to the dump reader. The *native* format is -for native LAMMPS dump files, written with a :doc:`dump atom ` or -:doc:`dump custom ` command. The *xyz* format is for generic XYZ +for native LAMMPS dump files, written with a :doc:`dump atom ` +or :doc:`dump custom ` command. The *xyz* format is for generic XYZ formatted dump files. These formats take no additional values. The *molfile* format supports reading data through using the `VMD `_ @@ -370,8 +369,6 @@ needed to generate absolute, unscaled coordinates. Restrictions """""""""""" -The *native* dump file reader does not support binary .bin dump files. - To read gzipped dump files, you must compile LAMMPS with the -DLAMMPS_GZIP option. See the :doc:`Build settings ` doc page for details. 
diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index 9b8e106875..9797d4d119 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -1,6 +1,6 @@ Sphinx sphinxcontrib-spelling -git+git://github.com/akohlmey/sphinx-fortran@parallel-read +git+https://github.com/akohlmey/sphinx-fortran@parallel-read sphinx_tabs breathe Pygments diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index d295767519..fe1e40e8ba 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -308,6 +308,7 @@ boolean boostostat boostostatting Boresch +borophene Botero Botu Bouguet @@ -688,8 +689,10 @@ diagonalizers diagonalizing Diallo diblock +dichalcogenide Dickel diel +Dietz differentiable diffusively diffusivity @@ -1125,6 +1128,7 @@ gaussian gaussians Gaussians Gavhane +Gawlik gayberne gcc gcmc @@ -1308,6 +1312,7 @@ hotpink Houlle howto Howto +Hoy Hoyt Hs hstyle @@ -1483,6 +1488,7 @@ Izz Jacobsen Jadhao Jadhav +Jagielski jagreat Jahn Jalalvand @@ -1609,6 +1615,7 @@ Koslowski Kosovan Koster Kosztin +Koziol Kp kradius Kraker @@ -1995,6 +2002,7 @@ minimizer minimizers minneigh minorder +MinSizeRel minSteps mintcream Mintmire @@ -2386,6 +2394,7 @@ ohenrich ok Okabe Okamoto +O'Hearn O'Keefe OKeefe oldlace @@ -2784,6 +2793,7 @@ relink relres relTol relu +RelWithDebInfo remappings remd Ren @@ -3073,6 +3083,7 @@ snav Snodin Sodani Soderlind +Solida solvated solvation someuser diff --git a/examples/ELASTIC_T/potential.mod b/examples/ELASTIC_T/potential.mod index b9ed80d865..d4b7cc7158 100644 --- a/examples/ELASTIC_T/potential.mod +++ b/examples/ELASTIC_T/potential.mod @@ -1,6 +1,8 @@ # NOTE: This script can be modified for different pair styles # See in.elastic for more info. 
+# we must undefine any fix ave/* fix before using reset_timestep +if "$(is_defined(fix,avp))" then "unfix avp" reset_timestep 0 # Choose potential diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table b/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table index b0d63dbbbf..96630f0ccc 100644 --- a/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table +++ b/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table @@ -1,4 +1,4 @@ -# local density potentials: (B,B), (W,W), (B,W), (W,B) +# local density potentials: (B,B), (W,W), (B,W), (W,B) UNITS: real 4 500 diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table b/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table index 348bccfa0e..d76ac0dfd8 100644 --- a/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table +++ b/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table @@ -1,4 +1,4 @@ - +# UNITS: real PairBB N 500 R 2.00000e-02 1.32500e+01 diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.in b/examples/PACKAGES/local_density/benzene_water/in.benzene_water similarity index 87% rename from examples/PACKAGES/local_density/benzene_water/benzene_water.in rename to examples/PACKAGES/local_density/benzene_water/in.benzene_water index 01fb3f27e5..69d39be357 100644 --- a/examples/PACKAGES/local_density/benzene_water/benzene_water.in +++ b/examples/PACKAGES/local_density/benzene_water/in.benzene_water @@ -11,7 +11,7 @@ # Initialize simulation box dimension 3 -boundary p p p +boundary p p p units real atom_style molecular @@ -32,7 +32,7 @@ pair_coeff * * local/density benzene_water.localdensity.table fix recentering all recenter 0.0 0.0 0.0 units box # Thermostat & time integration -timestep 2.0 +timestep 2.0 thermo 100 thermo_style custom temp ke pe etotal ebond eangle edihed evdwl @@ -49,14 +49,14 @@ run 5000 # Turn
off recentering during production phase unfix recentering +reset_timestep 0 # Setup trajectory output -dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element -dump_modify myDump element B W -dump_modify myDump sort id +#dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element +#dump_modify myDump element B W +#dump_modify myDump sort id # Production (for realistic results, run for 10000000 steps) -reset_timestep 0 -run 1000 +run 1000 diff --git a/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1 b/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1 deleted file mode 100644 index 928906edbd..0000000000 --- a/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1 +++ /dev/null @@ -1,267 +0,0 @@ -LAMMPS (7 Aug 2019) -# LAMMPS input file for 26.5% benzene mole fraction solution -# with 380 benzene and 1000 water molecules, -# using all possible local density potentials -# between benzene and water -# -# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara -# -# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693 - - - -# Initialize simulation box -dimension 3 -boundary p p p -units real -atom_style molecular - -# Set potential styles -pair_style hybrid/overlay table spline 500 local/density - -# Read molecule data and set initial velocities -read_data benzene_water.data - orthogonal box = (-12.865 -12.865 -64.829) to (12.865 12.865 64.829) - 1 by 1 by 8 MPI processor grid - reading atoms ... - 1380 atoms - 0 = max # of 1-2 neighbors - 0 = max # of 1-3 neighbors - 0 = max # of 1-4 neighbors - 1 = max # of special neighbors - special bonds CPU = 0.000566959 secs - read_data CPU = 0.00661397 secs -velocity all create 3.0000e+02 16611 rot yes dist gaussian - -# Assign potentials -pair_coeff 1 1 table benzene_water.pair.table PairBB -WARNING: 33 of 500 force values in table are inconsistent with -dE/dr. 
- Should only be flagged at inflection points (../pair_table.cpp:483) -WARNING: 150 of 500 distance values in table with relative error - over 1e-06 to re-computed values (../pair_table.cpp:492) -pair_coeff 1 2 table benzene_water.pair.table PairWW -WARNING: 61 of 500 force values in table are inconsistent with -dE/dr. - Should only be flagged at inflection points (../pair_table.cpp:483) -WARNING: 90 of 500 distance values in table with relative error - over 1e-06 to re-computed values (../pair_table.cpp:492) -pair_coeff 2 2 table benzene_water.pair.table PairBW -WARNING: 108 of 500 force values in table are inconsistent with -dE/dr. - Should only be flagged at inflection points (../pair_table.cpp:483) -WARNING: 135 of 500 distance values in table with relative error - over 1e-06 to re-computed values (../pair_table.cpp:492) -pair_coeff * * local/density benzene_water.localdensity.table - -# Recentering during minimization and equilibration -fix recentering all recenter 0.0 0.0 0.0 units box - -# Thermostat & time integration -timestep 2.0 -thermo 100 -thermo_style custom temp ke pe etotal ebond eangle edihed evdwl - -# Minimization -minimize 1.e-4 0.0 10000 10000 -WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168) -Neighbor list info ... 
- update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 15.25 - ghost atom cutoff = 15.25 - binsize = 7.625, bins = 4 4 18 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair table, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard - (2) pair local/density, perpetual, copy from (1) - attributes: half, newton on - pair build: copy - stencil: none - bin: none -Per MPI rank memory allocation (min/avg/max) = 8.061 | 8.32 | 8.674 Mbytes -Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl - 300 1233.1611 4162.3053 5395.4665 0 0 0 4162.3053 - 300 1233.1611 2275.526 3508.6871 0 0 0 2275.526 -Loop time of 0.352822 on 8 procs for 40 steps with 1380 atoms - -71.3% CPU use with 8 MPI tasks x no OpenMP threads - -Minimization stats: - Stopping criterion = linesearch alpha is zero - Energy initial, next-to-last, final = - 4162.30533361 2208.86525108 2275.52597861 - Force two-norm initial, final = 259.364 69.3915 - Force max component initial, final = 22.2077 8.31436 - Final line search alpha, max atom move = 2.90022e-12 2.41135e-11 - Iterations, force evaluations = 40 110 - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.053192 | 0.23903 | 0.32779 | 17.2 | 67.75 -Bond | 9.0599e-06 | 1.6302e-05 | 2.5272e-05 | 0.0 | 0.00 -Neigh | 0.00044513 | 0.0023614 | 0.0063851 | 5.1 | 0.67 -Comm | 0.015469 | 0.090432 | 0.20295 | 20.0 | 25.63 -Output | 0 | 0 | 0 | 0.0 | 0.00 -Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 0.02098 | | | 5.95 - -Nlocal: 172.5 ave 348 max 72 min -Histogram: 5 0 0 0 0 0 0 0 1 2 -Nghost: 2193.62 ave 4352 max 932 min -Histogram: 3 0 0 2 0 0 2 0 0 1 -Neighs: 9700.5 ave 20535 max 3685 min -Histogram: 5 0 0 0 0 0 0 1 0 2 - -Total # of neighbors = 77604 -Ave neighs/atom = 56.2348 -Ave special neighs/atom = 0 
-Neighbor list builds = 2 -Dangerous builds = 0 - -# Set up integration parameters -fix timeintegration all nve -fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890 - -# Equilibration (for realistic results, run for 5000000 steps) -reset_timestep 0 -run 5000 -WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131) -Per MPI rank memory allocation (min/avg/max) = 6.936 | 7.195 | 7.552 Mbytes -Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl - 300 1233.1611 2866.9109 4100.0721 0 0 0 2866.9109 - 273.33541 1123.5553 3983.2007 5106.756 0 0 0 3983.2007 - 293.68078 1207.1857 3319.6601 4526.8458 0 0 0 3319.6601 - 314.21462 1291.5908 3389.2178 4680.8086 0 0 0 3389.2178 - 323.77563 1330.8917 3332.9828 4663.8745 0 0 0 3332.9828 - 302.5902 1243.8082 3461.7692 4705.5774 0 0 0 3461.7692 - 295.39324 1214.2249 3411.5727 4625.7976 0 0 0 3411.5727 - 320.52341 1317.5234 3453.1931 4770.7164 0 0 0 3453.1931 - 312.00777 1282.5195 3403.3443 4685.8638 0 0 0 3403.3443 - 307.96774 1265.9128 3429.7809 4695.6937 0 0 0 3429.7809 - 294.75922 1211.6187 3388.8404 4600.4591 0 0 0 3388.8404 - 311.24567 1279.3869 3514.9603 4794.3472 0 0 0 3514.9603 - 306.6152 1260.3531 3447.2011 4707.5542 0 0 0 3447.2011 - 305.23306 1254.6718 3375.5092 4630.181 0 0 0 3375.5092 - 321.62889 1322.0675 3460.2581 4782.3256 0 0 0 3460.2581 - 316.37725 1300.4804 3437.0312 4737.5116 0 0 0 3437.0312 - 322.90522 1327.3139 3389.1262 4716.44 0 0 0 3389.1262 - 307.57893 1264.3146 3359.8491 4624.1637 0 0 0 3359.8491 - 302.22607 1242.3115 3406.1711 4648.4826 0 0 0 3406.1711 - 302.73997 1244.4239 3220.2582 4464.6821 0 0 0 3220.2582 - 303.66194 1248.2137 3318.4629 4566.6765 0 0 0 3318.4629 - 308.73862 1269.0815 3369.5894 4638.671 0 0 0 3369.5894 - 315.60294 1297.2976 3411.2405 4708.5381 0 0 0 3411.2405 - 310.0113 1274.3129 3360.1054 4634.4183 0 0 0 3360.1054 - 302.36229 1242.8714 3326.9845 4569.8559 0 0 0 3326.9845 - 317.78659 1306.2735 3355.4976 4661.7711 0 0 0 3355.4976 - 
302.50479 1243.4571 3317.6846 4561.1417 0 0 0 3317.6846 - 304.29249 1250.8056 3423.5068 4674.3124 0 0 0 3423.5068 - 305.99948 1257.8222 3432.9395 4690.7617 0 0 0 3432.9395 - 309.93363 1273.9937 3393.657 4667.6506 0 0 0 3393.657 - 316.14884 1299.5415 3463.0636 4762.6051 0 0 0 3463.0636 - 300.38817 1234.7567 3309.2495 4544.0062 0 0 0 3309.2495 - 311.05735 1278.6128 3304.4418 4583.0546 0 0 0 3304.4418 - 311.11872 1278.865 3291.1891 4570.0542 0 0 0 3291.1891 - 315.74338 1297.8749 3341.3063 4639.1812 0 0 0 3341.3063 - 297.5658 1223.1552 3316.3862 4539.5414 0 0 0 3316.3862 - 311.79033 1281.6257 3357.4556 4639.0813 0 0 0 3357.4556 - 310.93666 1278.1167 3414.7694 4692.8861 0 0 0 3414.7694 - 307.37298 1263.468 3337.3889 4600.8569 0 0 0 3337.3889 - 298.84185 1228.4005 3329.6173 4558.0178 0 0 0 3329.6173 - 310.54684 1276.5143 3351.0852 4627.5995 0 0 0 3351.0852 - 300.0871 1233.5191 3302.2315 4535.7506 0 0 0 3302.2315 - 304.69078 1252.4427 3324.2508 4576.6935 0 0 0 3324.2508 - 313.50714 1288.6827 3330.4088 4619.0915 0 0 0 3330.4088 - 329.80018 1355.6559 3301.86 4657.5159 0 0 0 3301.86 - 304.57609 1251.9713 3365.2938 4617.2652 0 0 0 3365.2938 - 308.73584 1269.0701 3344.4155 4613.4856 0 0 0 3344.4155 - 306.90951 1261.5629 3304.4698 4566.0327 0 0 0 3304.4698 - 308.85761 1269.5707 3392.1511 4661.7218 0 0 0 3392.1511 - 302.78788 1244.6208 3317.0849 4561.7057 0 0 0 3317.0849 - 321.68092 1322.2813 3321.5755 4643.8568 0 0 0 3321.5755 -Loop time of 16.3061 on 8 procs for 5000 steps with 1380 atoms - -Performance: 52.986 ns/day, 0.453 hours/ns, 306.634 timesteps/s -69.6% CPU use with 8 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 2.1872 | 10.542 | 14.607 | 116.7 | 64.65 -Bond | 0.00044084 | 0.00069669 | 0.00095081 | 0.0 | 0.00 -Neigh | 0.026948 | 0.15225 | 0.44344 | 42.0 | 0.93 -Comm | 0.63452 | 4.2953 | 9.49 | 133.9 | 26.34 -Output | 
0.0016391 | 0.012378 | 0.050919 | 13.9 | 0.08 -Modify | 0.45894 | 1.2107 | 4.4629 | 116.4 | 7.42 -Other | | 0.09292 | | | 0.57 - -Nlocal: 172.5 ave 380 max 70 min -Histogram: 5 0 0 0 0 0 0 1 1 1 -Nghost: 2213 ave 4440 max 903 min -Histogram: 3 0 0 2 0 0 2 0 0 1 -Neighs: 10042.5 ave 24051 max 3500 min -Histogram: 5 0 0 0 0 0 0 1 1 1 - -Total # of neighbors = 80340 -Ave neighs/atom = 58.2174 -Ave special neighs/atom = 0 -Neighbor list builds = 123 -Dangerous builds = 1 - -# Turn off recentering during production phase -unfix recentering - -# Setup trajectory output -dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element -dump_modify myDump element B W -dump_modify myDump sort id - -# Production (for realistic results, run for 10000000 steps) -reset_timestep 0 -run 1000 -Per MPI rank memory allocation (min/avg/max) = 8.232 | 8.492 | 8.851 Mbytes -Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl - 321.68092 1322.2813 3784.0834 5106.3647 0 0 0 3784.0834 - 310.59763 1276.7231 3318.3283 4595.0513 0 0 0 3318.3283 - 303.39445 1247.1141 3324.1191 4571.2332 0 0 0 3324.1191 - 311.37275 1279.9092 3305.0901 4584.9993 0 0 0 3305.0901 - 311.29071 1279.572 3248.216 4527.788 0 0 0 3248.216 - 314.53456 1292.906 3283.4563 4576.3623 0 0 0 3283.4563 - 316.52595 1301.0916 3258.9171 4560.0087 0 0 0 3258.9171 - 318.92447 1310.9509 3235.6256 4546.5765 0 0 0 3235.6256 - 311.79212 1281.6331 3308.099 4589.7321 0 0 0 3308.099 - 305.52477 1255.8709 3267.6907 4523.5616 0 0 0 3267.6907 - 301.07457 1237.5782 3206.3997 4443.9779 0 0 0 3206.3997 -Loop time of 4.44139 on 8 procs for 1000 steps with 1380 atoms - -Performance: 38.907 ns/day, 0.617 hours/ns, 225.155 timesteps/s -60.8% CPU use with 8 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.656 | 2.5078 | 3.5775 | 57.7 | 56.46 -Bond | 0.00013375 | 0.0001854 | 0.0002377 | 0.0 | 
0.00 -Neigh | 0.0048757 | 0.029188 | 0.090432 | 18.9 | 0.66 -Comm | 0.51836 | 1.4427 | 2.6285 | 56.9 | 32.48 -Output | 0.083084 | 0.089199 | 0.10333 | 2.3 | 2.01 -Modify | 0.0087376 | 0.019705 | 0.038437 | 8.4 | 0.44 -Other | | 0.3526 | | | 7.94 - -Nlocal: 172.5 ave 388 max 69 min -Histogram: 5 0 0 0 0 0 0 2 0 1 -Nghost: 2207.88 ave 4429 max 896 min -Histogram: 3 0 0 2 0 0 2 0 0 1 -Neighs: 10094.1 ave 24847 max 3403 min -Histogram: 5 0 0 0 0 0 1 1 0 1 - -Total # of neighbors = 80753 -Ave neighs/atom = 58.5167 -Ave special neighs/atom = 0 -Neighbor list builds = 23 -Dangerous builds = 0 - - -Total wall time: 0:00:21 diff --git a/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1 b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1 new file mode 100644 index 0000000000..034b60ea67 --- /dev/null +++ b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1 @@ -0,0 +1,300 @@ +LAMMPS (27 Oct 2021) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +# LAMMPS input file for 26.5% benzene mole fraction solution +# with 380 benzene and 1000 water molecules, +# using all possible local density potentials +# between benzene and water +# +# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara +# +# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693 + + + +# Initialize simulation box +dimension 3 +boundary p p p +units real +atom_style molecular + +# Set potential styles +pair_style hybrid/overlay table spline 500 local/density + +# Read molecule data and set initial velocities +read_data benzene_water.data +Reading data file ... + orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 1380 atoms +Finding 1-2 1-3 1-4 neighbors ... 
+ special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 0 = max # of 1-2 neighbors + 0 = max # of 1-3 neighbors + 0 = max # of 1-4 neighbors + 1 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.006 seconds +velocity all create 3.0000e+02 16611 rot yes dist gaussian + +# Assign potentials +pair_coeff 1 1 table benzene_water.pair.table PairBB +WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 150 of 500 distance values in table 1e-06 with relative error +WARNING: over PairBB to re-computed values (src/pair_table.cpp:473) +pair_coeff 1 2 table benzene_water.pair.table PairWW +WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 90 of 500 distance values in table 1e-06 with relative error +WARNING: over PairWW to re-computed values (src/pair_table.cpp:473) +pair_coeff 2 2 table benzene_water.pair.table PairBW +WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr. 
+WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 135 of 500 distance values in table 1e-06 with relative error +WARNING: over PairBW to re-computed values (src/pair_table.cpp:473) +pair_coeff * * local/density benzene_water.localdensity.table + +# Recentering during minimization and equilibration +fix recentering all recenter 0.0 0.0 0.0 units box + +# Thermostat & time integration +timestep 2.0 +thermo 100 +thermo_style custom temp ke pe etotal ebond eangle edihed evdwl + +# Minimization +minimize 1.e-4 0.0 10000 10000 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- pair_style local/density command: + +@Article{Sanyal16, + author = {T.Sanyal and M.Scott Shell}, + title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation}, + journal = {J.~Chem.~Phys.}, + year = 2016, + DOI = doi.org/10.1063/1.4958629} + +@Article{Sanyal18, + author = {T.Sanyal and M.Scott Shell}, + title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy}, + journal = {J.~Phys.~Chem. B}, + year = 2018, + DOI = doi.org/10.1021/acs.jpcb.7b12446} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187) + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 15.25 + ghost atom cutoff = 15.25 + binsize = 7.625, bins = 4 4 18 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair table, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d + bin: standard + (2) pair local/density, perpetual, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 8.754 | 8.754 | 8.754 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 300 1233.1611 2374.6749 3607.836 0 0 0 2374.6749 + 300 1233.1611 985.54829 2218.7094 0 0 0 985.54829 + 300 1233.1611 962.66036 2195.8215 0 0 0 962.66036 +Loop time of 0.812343 on 1 procs for 134 steps with 1380 atoms + +99.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + 2374.67491482358 962.664796664787 962.660357218268 + Force two-norm initial, final = 263.77519 15.741017 + Force max component initial, final = 22.412654 7.9360139 + Final line search alpha, max atom move = 0.014975513 0.11884588 + Iterations, force evaluations = 134 240 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.78539 | 0.78539 | 0.78539 | 0.0 | 96.68 +Bond | 2.0149e-05 | 2.0149e-05 | 2.0149e-05 | 0.0 | 0.00 +Neigh | 0.016759 | 0.016759 | 0.016759 | 0.0 | 2.06 +Comm | 0.0045 | 0.0045 | 0.0045 | 0.0 | 0.55 +Output | 2.9402e-05 | 2.9402e-05 | 2.9402e-05 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.005647 | | | 0.70 + +Nlocal: 1380.00 ave 1380 max 1380 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 5832.00 ave 5832 max 5832 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 78165.0 ave 78165 max 78165 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # 
of neighbors = 78165 +Ave neighs/atom = 56.641304 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 5 +Dangerous builds = 0 + +# Set up integration parameters +fix timeintegration all nve +fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890 + +# Equilibration (for realistic results, run for 5000000 steps) +reset_timestep 0 +run 5000 + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133) +Per MPI rank memory allocation (min/avg/max) = 7.629 | 7.629 | 7.629 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 300 1233.1611 962.66036 2195.8215 0 0 0 962.66036 + 253.1913 1040.7522 1803.711 2844.4633 0 0 0 1803.711 + 290.31049 1193.332 2059.0637 3252.3958 0 0 0 2059.0637 + 299.30778 1230.3157 2140.226 3370.5417 0 0 0 2140.226 + 309.81524 1273.507 2178.3782 3451.8853 0 0 0 2178.3782 + 299.79526 1232.3195 2229.9248 3462.2444 0 0 0 2229.9248 + 299.24909 1230.0745 2260.7129 3490.7874 0 0 0 2260.7129 + 299.5898 1231.475 2244.2384 3475.7134 0 0 0 2244.2384 + 297.81223 1224.1682 2320.27 3544.4382 0 0 0 2320.27 + 301.53975 1239.4903 2277.0431 3516.5334 0 0 0 2277.0431 + 292.00572 1200.3003 2292.3073 3492.6076 0 0 0 2292.3073 + 309.19709 1270.9661 2303.6055 3574.5716 0 0 0 2303.6055 + 297.54933 1223.0876 2304.127 3527.2146 0 0 0 2304.127 + 303.48106 1247.4702 2303.5673 3551.0375 0 0 0 2303.5673 + 296.46047 1218.6118 2256.1591 3474.7709 0 0 0 2256.1591 + 299.4835 1231.038 2280.0452 3511.0832 0 0 0 2280.0452 + 306.25958 1258.8914 2307.9795 3566.8709 0 0 0 2307.9795 + 304.67335 1252.3711 2284.8252 3537.1963 0 0 0 2284.8252 + 298.33637 1226.3227 2289.8499 3516.1726 0 0 0 2289.8499 + 303.1338 1246.0427 2342.2148 3588.2575 0 0 0 2342.2148 + 305.86051 1257.251 2341.0106 3598.2616 0 0 0 2341.0106 + 297.75418 1223.9296 2303.5613 3527.4909 0 0 0 2303.5613 + 296.79348 1219.9806 2327.5207 3547.5013 0 0 0 2327.5207 + 307.25403 1262.9791 
2288.4219 3551.401 0 0 0 2288.4219 + 301.26976 1238.3805 2291.2465 3529.627 0 0 0 2291.2465 + 297.17249 1221.5385 2283.3926 3504.9311 0 0 0 2283.3926 + 313.99072 1290.6705 2293.9661 3584.6366 0 0 0 2293.9661 + 301.70804 1240.1821 2331.1694 3571.3515 0 0 0 2331.1694 + 300.62599 1235.7343 2325.4367 3561.171 0 0 0 2325.4367 + 292.13495 1200.8316 2315.631 3516.4626 0 0 0 2315.631 + 313.9981 1290.7008 2286.0536 3576.7545 0 0 0 2286.0536 + 300.25311 1234.2015 2324.2379 3558.4394 0 0 0 2324.2379 + 309.3746 1271.6958 2322.2298 3593.9256 0 0 0 2322.2298 + 300.23041 1234.1082 2332.7521 3566.8603 0 0 0 2332.7521 + 302.97054 1245.3716 2303.1689 3548.5405 0 0 0 2303.1689 + 294.77155 1211.6694 2334.5087 3546.1781 0 0 0 2334.5087 + 296.81476 1220.0681 2322.5932 3542.6613 0 0 0 2322.5932 + 301.83238 1240.6932 2345.4841 3586.1773 0 0 0 2345.4841 + 295.0399 1212.7724 2312.3889 3525.1614 0 0 0 2312.3889 + 300.73565 1236.185 2338.8384 3575.0235 0 0 0 2338.8384 + 303.02264 1245.5858 2310.0868 3555.6726 0 0 0 2310.0868 + 302.86404 1244.9339 2332.2001 3577.134 0 0 0 2332.2001 + 293.77916 1207.5901 2293.2799 3500.8701 0 0 0 2293.2799 + 299.30072 1230.2867 2317.5065 3547.7933 0 0 0 2317.5065 + 311.05029 1278.5837 2311.0476 3589.6313 0 0 0 2311.0476 + 293.25646 1205.4416 2314.7398 3520.1814 0 0 0 2314.7398 + 310.49018 1276.2814 2337.4909 3613.7723 0 0 0 2337.4909 + 302.37336 1242.9169 2340.3197 3583.2366 0 0 0 2340.3197 + 297.06862 1221.1116 2323.9136 3545.0252 0 0 0 2323.9136 + 300.54817 1235.4144 2315.2405 3550.6549 0 0 0 2315.2405 + 309.10643 1270.5934 2333.1848 3603.7783 0 0 0 2333.1848 +Loop time of 15.2696 on 1 procs for 5000 steps with 1380 atoms + +Performance: 56.583 ns/day, 0.424 hours/ns, 327.447 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 14.432 | 14.432 | 14.432 | 0.0 | 94.51 +Bond | 0.00032375 | 
0.00032375 | 0.00032375 | 0.0 | 0.00 +Neigh | 0.41541 | 0.41541 | 0.41541 | 0.0 | 2.72 +Comm | 0.0975 | 0.0975 | 0.0975 | 0.0 | 0.64 +Output | 0.0013044 | 0.0013044 | 0.0013044 | 0.0 | 0.01 +Modify | 0.30336 | 0.30336 | 0.30336 | 0.0 | 1.99 +Other | | 0.01973 | | | 0.13 + +Nlocal: 1380.00 ave 1380 max 1380 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 5843.00 ave 5843 max 5843 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 76949.0 ave 76949 max 76949 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 76949 +Ave neighs/atom = 55.760145 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 121 +Dangerous builds = 1 + +# Turn off recentering during production phase +unfix recentering + +# Setup trajectory output +dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element +dump_modify myDump element B W +dump_modify myDump sort id + +# Production (for realistic results, run for 10000000 steps) +reset_timestep 0 +run 1000 + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 9.022 | 9.022 | 9.022 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 309.10643 1270.5934 2333.1848 3603.7783 0 0 0 2333.1848 + 300.84572 1236.6375 2331.3493 3567.9868 0 0 0 2331.3493 + 300.90599 1236.8852 2337.6775 3574.5627 0 0 0 2337.6775 + 302.77895 1244.5841 2341.7778 3586.362 0 0 0 2341.7778 + 291.66639 1198.9055 2320.3512 3519.2567 0 0 0 2320.3512 + 298.7003 1227.8187 2292.8195 3520.6382 0 0 0 2292.8195 + 301.11163 1237.7305 2310.017 3547.7475 0 0 0 2310.017 + 305.22515 1254.6393 2315.1355 3569.7748 0 0 0 2315.1355 + 295.15921 1213.2629 2310.184 3523.4468 0 0 0 2310.184 + 299.2024 1229.8826 2332.2118 3562.0943 0 0 0 2332.2118 + 302.80078 1244.6738 2320.3763 3565.0502 0 0 0 2320.3763 +Loop time of 3.07208 on 1 procs for 1000 steps with 1380 atoms + +Performance: 56.249 ns/day, 0.427 hours/ns, 325.512 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing 
breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.8993 | 2.8993 | 2.8993 | 0.0 | 94.37 +Bond | 6.5327e-05 | 6.5327e-05 | 6.5327e-05 | 0.0 | 0.00 +Neigh | 0.083502 | 0.083502 | 0.083502 | 0.0 | 2.72 +Comm | 0.019967 | 0.019967 | 0.019967 | 0.0 | 0.65 +Output | 0.012268 | 0.012268 | 0.012268 | 0.0 | 0.40 +Modify | 0.052801 | 0.052801 | 0.052801 | 0.0 | 1.72 +Other | | 0.004203 | | | 0.14 + +Nlocal: 1380.00 ave 1380 max 1380 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 5860.00 ave 5860 max 5860 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 77055.0 ave 77055 max 77055 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 77055 +Ave neighs/atom = 55.836957 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 24 +Dangerous builds = 0 + + +Total wall time: 0:00:19 diff --git a/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4 b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4 new file mode 100644 index 0000000000..f841181777 --- /dev/null +++ b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4 @@ -0,0 +1,299 @@ +LAMMPS (27 Oct 2021) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +# LAMMPS input file for 26.5% benzene mole fraction solution +# with 380 benzene and 1000 water molecules, +# using all possible local density potentials +# between benzene and water +# +# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara +# +# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693 + + + +# Initialize simulation box +dimension 3 +boundary p p p +units real +atom_style molecular + +# Set potential styles +pair_style hybrid/overlay table spline 500 local/density + +# Read molecule data and set initial velocities +read_data benzene_water.data +Reading data file ... 
+ orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000) + 1 by 1 by 4 MPI processor grid + reading atoms ... + 1380 atoms +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 0 = max # of 1-2 neighbors + 0 = max # of 1-3 neighbors + 0 = max # of 1-4 neighbors + 1 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.007 seconds +velocity all create 3.0000e+02 16611 rot yes dist gaussian + +# Assign potentials +pair_coeff 1 1 table benzene_water.pair.table PairBB +WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 150 of 500 distance values in table 1e-06 with relative error +WARNING: over PairBB to re-computed values (src/pair_table.cpp:473) +pair_coeff 1 2 table benzene_water.pair.table PairWW +WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 90 of 500 distance values in table 1e-06 with relative error +WARNING: over PairWW to re-computed values (src/pair_table.cpp:473) +pair_coeff 2 2 table benzene_water.pair.table PairBW +WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr. 
+WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 135 of 500 distance values in table 1e-06 with relative error +WARNING: over PairBW to re-computed values (src/pair_table.cpp:473) +pair_coeff * * local/density benzene_water.localdensity.table + +# Recentering during minimization and equilibration +fix recentering all recenter 0.0 0.0 0.0 units box + +# Thermostat & time integration +timestep 2.0 +thermo 100 +thermo_style custom temp ke pe etotal ebond eangle edihed evdwl + +# Minimization +minimize 1.e-4 0.0 10000 10000 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- pair_style local/density command: + +@Article{Sanyal16, + author = {T.Sanyal and M.Scott Shell}, + title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation}, + journal = {J.~Chem.~Phys.}, + year = 2016, + DOI = doi.org/10.1063/1.4958629} + +@Article{Sanyal18, + author = {T.Sanyal and M.Scott Shell}, + title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy}, + journal = {J.~Phys.~Chem. B}, + year = 2018, + DOI = doi.org/10.1021/acs.jpcb.7b12446} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187) + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 15.25 + ghost atom cutoff = 15.25 + binsize = 7.625, bins = 4 4 18 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair table, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d + bin: standard + (2) pair local/density, perpetual, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 8.441 | 8.589 | 8.688 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 300 1233.1611 2374.6749 3607.836 0 0 0 2374.6749 + 300 1233.1611 1024.8113 2257.9724 0 0 0 1024.8113 +Loop time of 0.240559 on 4 procs for 74 steps with 1380 atoms + +98.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + 2374.67491482358 1024.89407898645 1024.81130011575 + Force two-norm initial, final = 263.77519 20.459697 + Force max component initial, final = 22.412654 8.6082349 + Final line search alpha, max atom move = 0.027790997 0.23923143 + Iterations, force evaluations = 74 118 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.15928 | 0.1873 | 0.22814 | 6.5 | 77.86 +Bond | 3.857e-06 | 4.4012e-06 | 5.496e-06 | 0.0 | 0.00 +Neigh | 0.00064142 | 0.0028761 | 0.0058864 | 4.2 | 1.20 +Comm | 0.0040776 | 0.039595 | 0.074187 | 12.6 | 16.46 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.01078 | | | 4.48 + +Nlocal: 345.000 ave 664 max 147 min +Histogram: 2 0 0 0 0 1 0 0 0 1 +Nghost: 2850.50 ave 4438 max 1208 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Neighs: 19377.5 ave 37718 max 7456 min +Histogram: 2 0 0 0 0 1 0 0 0 1 + +Total # of neighbors = 77510 +Ave neighs/atom = 56.166667 +Ave special neighs/atom 
= 0.0000000 +Neighbor list builds = 3 +Dangerous builds = 0 + +# Set up integration parameters +fix timeintegration all nve +fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890 + +# Equilibration (for realistic results, run for 5000000 steps) +reset_timestep 0 +run 5000 + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133) +Per MPI rank memory allocation (min/avg/max) = 7.316 | 7.465 | 7.563 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 300 1233.1611 1024.8113 2257.9724 0 0 0 1024.8113 + 263.61917 1083.6164 1866.745 2950.3614 0 0 0 1866.745 + 296.0253 1216.823 2122.8463 3339.6692 0 0 0 2122.8463 + 301.93846 1241.1292 2172.9802 3414.1095 0 0 0 2172.9802 + 293.9491 1208.2887 2205.4892 3413.7779 0 0 0 2205.4892 + 286.33795 1177.0027 2204.8908 3381.8935 0 0 0 2204.8908 + 295.48217 1214.5904 2230.8849 3445.4753 0 0 0 2230.8849 + 293.88908 1208.0419 2218.7563 3426.7982 0 0 0 2218.7563 + 295.13798 1213.1756 2277.4515 3490.6271 0 0 0 2277.4515 + 290.39538 1193.681 2273.4385 3467.1195 0 0 0 2273.4385 + 297.56782 1223.1635 2268.7182 3491.8817 0 0 0 2268.7182 + 306.45578 1259.6978 2289.1507 3548.8486 0 0 0 2289.1507 + 308.54582 1268.289 2284.8514 3553.1404 0 0 0 2284.8514 + 302.17353 1242.0955 2262.5577 3504.6532 0 0 0 2262.5577 + 295.30087 1213.8452 2315.8853 3529.7305 0 0 0 2315.8853 + 308.59197 1268.4787 2291.8314 3560.3101 0 0 0 2291.8314 + 297.75618 1223.9378 2287.2003 3511.1381 0 0 0 2287.2003 + 303.43395 1247.2765 2297.7158 3544.9923 0 0 0 2297.7158 + 307.16233 1262.6021 2255.9769 3518.5791 0 0 0 2255.9769 + 301.34428 1238.6868 2284.416 3523.1028 0 0 0 2284.416 + 295.43209 1214.3846 2294.1043 3508.4889 0 0 0 2294.1043 + 287.86904 1183.2963 2257.0204 3440.3168 0 0 0 2257.0204 + 297.2661 1221.9233 2251.4194 3473.3428 0 0 0 2251.4194 + 298.90221 1228.6486 2261.834 3490.4826 0 0 0 2261.834 + 288.07202 1184.1307 2284.1918 
3468.3225 0 0 0 2284.1918 + 300.41201 1234.8547 2303.9573 3538.812 0 0 0 2303.9573 + 283.91279 1167.034 2329.7936 3496.8277 0 0 0 2329.7936 + 297.27507 1221.9602 2337.0516 3559.0118 0 0 0 2337.0516 + 296.22263 1217.6341 2335.6424 3553.2765 0 0 0 2335.6424 + 296.13784 1217.2856 2364.7034 3581.989 0 0 0 2364.7034 + 308.17642 1266.7706 2320.2753 3587.0459 0 0 0 2320.2753 + 310.26592 1275.3596 2301.9318 3577.2914 0 0 0 2301.9318 + 292.97391 1204.2801 2289.8116 3494.0917 0 0 0 2289.8116 + 294.81231 1211.8369 2315.0388 3526.8757 0 0 0 2315.0388 + 298.66155 1227.6594 2317.2844 3544.9437 0 0 0 2317.2844 + 302.77939 1244.5859 2301.2063 3545.7922 0 0 0 2301.2063 + 291.47597 1198.1228 2285.1757 3483.2985 0 0 0 2285.1757 + 286.19045 1176.3964 2265.2665 3441.6629 0 0 0 2265.2665 + 295.58144 1214.9984 2272.3165 3487.315 0 0 0 2272.3165 + 283.86988 1166.8577 2320.6142 3487.4719 0 0 0 2320.6142 + 300.0576 1233.3979 2330.8962 3564.2941 0 0 0 2330.8962 + 299.86413 1232.6026 2321.2281 3553.8308 0 0 0 2321.2281 + 292.79017 1203.5248 2334.2308 3537.7557 0 0 0 2334.2308 + 291.5027 1198.2327 2335.2119 3533.4446 0 0 0 2335.2119 + 299.55471 1231.3307 2332.5216 3563.8524 0 0 0 2332.5216 + 293.29613 1205.6046 2295.3263 3500.9309 0 0 0 2295.3263 + 303.13151 1246.0333 2310.0548 3556.0881 0 0 0 2310.0548 + 298.83954 1228.391 2297.3117 3525.7027 0 0 0 2297.3117 + 297.44775 1222.67 2307.2483 3529.9183 0 0 0 2307.2483 + 309.59874 1272.6171 2309.2439 3581.861 0 0 0 2309.2439 + 307.47844 1263.9015 2274.998 3538.8995 0 0 0 2274.998 +Loop time of 11.2235 on 4 procs for 5000 steps with 1380 atoms + +Performance: 76.982 ns/day, 0.312 hours/ns, 445.495 timesteps/s +98.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 7.1444 | 8.5074 | 10.534 | 44.9 | 75.80 +Bond | 0.00017048 | 0.00020672 | 0.00030488 | 0.0 | 0.00 +Neigh | 0.026174 | 0.12108 | 
0.26052 | 28.2 | 1.08 +Comm | 0.21788 | 1.8597 | 3.3375 | 81.2 | 16.57 +Output | 0.0008989 | 0.0069895 | 0.021647 | 10.2 | 0.06 +Modify | 0.19418 | 0.7044 | 2.1378 | 98.6 | 6.28 +Other | | 0.02368 | | | 0.21 + +Nlocal: 345.000 ave 678 max 148 min +Histogram: 2 0 0 0 1 0 0 0 0 1 +Nghost: 2854.25 ave 4464 max 1181 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Neighs: 19366.8 ave 38533 max 7481 min +Histogram: 2 0 0 0 0 1 0 0 0 1 + +Total # of neighbors = 77467 +Ave neighs/atom = 56.135507 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 121 +Dangerous builds = 1 + +# Turn off recentering during production phase +unfix recentering + +# Setup trajectory output +dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element +dump_modify myDump element B W +dump_modify myDump sort id + +# Production (for realistic results, run for 10000000 steps) +reset_timestep 0 +run 1000 + generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 8.640 | 8.791 | 8.894 Mbytes +Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl + 307.47844 1263.9015 2274.998 3538.8995 0 0 0 2274.998 + 309.46142 1272.0526 2274.8499 3546.9026 0 0 0 2274.8499 + 300.70977 1236.0787 2301.0588 3537.1374 0 0 0 2301.0588 + 300.53659 1235.3668 2316.1008 3551.4675 0 0 0 2316.1008 + 300.48582 1235.1581 2296.3009 3531.459 0 0 0 2296.3009 + 299.2618 1230.1267 2325.7501 3555.8768 0 0 0 2325.7501 + 303.00905 1245.5299 2321.8238 3567.3537 0 0 0 2321.8238 + 300.07018 1233.4496 2339.2833 3572.7329 0 0 0 2339.2833 + 304.20292 1250.4374 2353.1018 3603.5392 0 0 0 2353.1018 + 304.19487 1250.4043 2334.5087 3584.913 0 0 0 2334.5087 + 294.24283 1209.4961 2335.0535 3544.5496 0 0 0 2335.0535 +Loop time of 2.90512 on 4 procs for 1000 steps with 1380 atoms + +Performance: 59.481 ns/day, 0.403 hours/ns, 344.220 timesteps/s +98.4% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| 
%total +--------------------------------------------------------------- +Pair | 1.8627 | 2.2082 | 2.7289 | 22.6 | 76.01 +Bond | 4.042e-05 | 5.3677e-05 | 8.4044e-05 | 0.0 | 0.00 +Neigh | 0.0066184 | 0.030172 | 0.064523 | 13.9 | 1.04 +Comm | 0.05914 | 0.51145 | 0.86887 | 40.7 | 17.61 +Output | 0.0057814 | 0.0073478 | 0.011158 | 2.6 | 0.25 +Modify | 0.0085337 | 0.020869 | 0.042248 | 9.4 | 0.72 +Other | | 0.127 | | | 4.37 + +Nlocal: 345.000 ave 682 max 147 min +Histogram: 2 0 0 0 1 0 0 0 0 1 +Nghost: 2836.25 ave 4427 max 1175 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Neighs: 19249.8 ave 38683 max 7433 min +Histogram: 2 0 0 0 1 0 0 0 0 1 + +Total # of neighbors = 76999 +Ave neighs/atom = 55.796377 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 23 +Dangerous builds = 0 + + +Total wall time: 0:00:14 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in b/examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water similarity index 86% rename from examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in rename to examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water index ef92fbe655..76038b2337 100644 --- a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in +++ b/examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water @@ -1,6 +1,6 @@ # LAMMPS input file for 50.0% methanol mole fraction solution # with 2500 methanol molecules in implicit water. 
-# +# # # Author: David Rosenberger, van der Vegt Group, TU Darmstadt # @@ -9,7 +9,7 @@ # Initialize simulation box dimension 3 -boundary p p p +boundary p p p units real atom_style molecular @@ -17,7 +17,7 @@ atom_style molecular pair_style hybrid/overlay table spline 500 local/density # Read molecule data and set initial velocities -read_data methanol_implicit_water.data +read_data methanol_implicit_water.data velocity all create 3.0000e+02 12142 rot yes dist gaussian # Assign potentials @@ -31,7 +31,7 @@ pair_coeff * * local/density methanol_implicit_water.localdensity.t fix recentering all recenter 0.0 0.0 0.0 units box #Thermostat & time integration -timestep 1.0 +timestep 1.0 thermo 100 thermo_style custom etotal ke pe temp evdwl @@ -52,15 +52,14 @@ run 2000 #turn off recentering during production run unfix recentering - +reset_timestep 0 #setup trajectory output -dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element -dump_modify myDump element M -dump_modify myDump sort id +#dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element +#dump_modify myDump element M +#dump_modify myDump sort id #run production (for realistic results, run for 10000000 steps) -reset_timestep 0 thermo 1000 thermo_style custom etotal ke pe temp evdwl run 10000 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1 b/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1 deleted file mode 100644 index 618e994946..0000000000 --- a/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1 +++ /dev/null @@ -1,226 +0,0 @@ -LAMMPS (7 Aug 2019) -# LAMMPS input file for 50.0% methanol mole fraction solution -# with 2500 methanol molecules in implicit water. -# -# -# Author: David Rosenberger, van der Vegt Group, TU Darmstadt -# -# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 
15, 2881-2895 (2019) - - -# Initialize simulation box -dimension 3 -boundary p p p -units real -atom_style molecular - -# Set potential styles -pair_style hybrid/overlay table spline 500 local/density - -# Read molecule data and set initial velocities -read_data methanol_implicit_water.data - orthogonal box = (-31.123 -31.123 -31.123) to (31.123 31.123 31.123) - 2 by 2 by 2 MPI processor grid - reading atoms ... - 2500 atoms - 0 = max # of 1-2 neighbors - 0 = max # of 1-3 neighbors - 0 = max # of 1-4 neighbors - 1 = max # of special neighbors - special bonds CPU = 0.00063014 secs - read_data CPU = 0.00599909 secs -velocity all create 3.0000e+02 12142 rot yes dist gaussian - -# Assign potentials -pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM -WARNING: 93 of 500 force values in table are inconsistent with -dE/dr. - Should only be flagged at inflection points (../pair_table.cpp:483) -WARNING: 254 of 500 distance values in table with relative error - over 1e-06 to re-computed values (../pair_table.cpp:492) -pair_coeff * * local/density methanol_implicit_water.localdensity.table - - - - -#Recentering during minimization and equilibration -fix recentering all recenter 0.0 0.0 0.0 units box - -#Thermostat & time integration -timestep 1.0 -thermo 100 -thermo_style custom etotal ke pe temp evdwl - -#minimization -minimize 1.e-4 0.0 1000 1000 -WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168) -Neighbor list info ... 
- update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 17 - ghost atom cutoff = 17 - binsize = 8.5, bins = 8 8 8 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair table, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard - (2) pair local/density, perpetual, copy from (1) - attributes: half, newton on - pair build: copy - stencil: none - bin: none -Per MPI rank memory allocation (min/avg/max) = 7.411 | 7.411 | 7.412 Mbytes -TotEng KinEng PotEng Temp E_vdwl - 1470.3564 2234.7133 -764.35689 300 -764.35689 - 46.496766 2234.7133 -2188.2165 300 -2188.2165 - 7.9030246 2234.7133 -2226.8103 300 -2226.8103 -Loop time of 0.463996 on 8 procs for 121 steps with 2500 atoms - -91.4% CPU use with 8 MPI tasks x no OpenMP threads - -Minimization stats: - Stopping criterion = linesearch alpha is zero - Energy initial, next-to-last, final = - -764.356892369 -2227.85589084 -2226.81026984 - Force two-norm initial, final = 134.911 3.83896 - Force max component initial, final = 14.1117 1.07422 - Final line search alpha, max atom move = 5.06747e-10 5.44356e-10 - Iterations, force evaluations = 121 154 - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.41442 | 0.41976 | 0.42434 | 0.5 | 90.47 -Bond | 1.1683e-05 | 2.0713e-05 | 3.5048e-05 | 0.0 | 0.00 -Neigh | 0.0084722 | 0.0090862 | 0.010038 | 0.5 | 1.96 -Comm | 0.022712 | 0.028157 | 0.034072 | 1.9 | 6.07 -Output | 3.1948e-05 | 3.6925e-05 | 6.6996e-05 | 0.0 | 0.01 -Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 0.006937 | | | 1.50 - -Nlocal: 312.5 ave 333 max 299 min -Histogram: 2 2 0 0 1 0 2 0 0 1 -Nghost: 2546 ave 2580 max 2517 min -Histogram: 1 1 0 3 0 1 0 0 0 2 -Neighs: 33215.4 ave 37251 max 29183 min -Histogram: 1 0 0 1 2 2 0 1 0 1 - -Total # of neighbors = 265723 -Ave neighs/atom 
= 106.289 -Ave special neighs/atom = 0 -Neighbor list builds = 6 -Dangerous builds = 0 - -#set up integration parameters -fix timeintegration all nve -fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915 - -#Equilibration (for realistic results, run for 2000000 steps) -reset_timestep 0 -thermo 200 -thermo_style custom etotal ke pe temp evdwl - -#run equilibration -run 2000 -WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131) -Per MPI rank memory allocation (min/avg/max) = 6.286 | 6.286 | 6.287 Mbytes -TotEng KinEng PotEng Temp E_vdwl - 177.26822 2234.7133 -2057.4451 300 -2057.4451 - 736.24287 2151.2608 -1415.0179 288.79688 -1415.0179 - 963.07617 2090.6433 -1127.5671 280.65926 -1127.5671 - 1148.9049 2173.1327 -1024.2279 291.73309 -1024.2279 - 1303.6409 2279.8586 -976.21767 306.06055 -976.21767 - 1355.42 2281.0383 -925.61826 306.21892 -925.61826 - 1394.5206 2276.2093 -881.68863 305.57064 -881.68863 - 1346.9764 2215.2973 -868.32091 297.3935 -868.32091 - 1381.3654 2248.8061 -867.44063 301.89189 -867.44063 - 1315.8059 2189.3193 -873.51332 293.90606 -873.51332 - 1314.4456 2209.7431 -895.29752 296.64787 -895.29752 -Loop time of 6.38989 on 8 procs for 2000 steps with 2500 atoms - -Performance: 27.043 ns/day, 0.887 hours/ns, 312.994 timesteps/s -80.5% CPU use with 8 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 5.2693 | 5.3572 | 5.457 | 2.1 | 83.84 -Bond | 0.00028825 | 0.00033835 | 0.00039148 | 0.0 | 0.01 -Neigh | 0.0296 | 0.032337 | 0.035071 | 0.9 | 0.51 -Comm | 0.64679 | 0.73397 | 0.80847 | 5.2 | 11.49 -Output | 0.00033498 | 0.00051582 | 0.0015228 | 0.0 | 0.01 -Modify | 0.16395 | 0.18919 | 0.21056 | 3.9 | 2.96 -Other | | 0.07636 | | | 1.19 - -Nlocal: 312.5 ave 337 max 295 min -Histogram: 2 2 0 1 0 0 0 1 1 1 -Nghost: 2551.62 ave 2582 max 2525 min -Histogram: 2 1 0 0 1 1 1 0 
1 1 -Neighs: 33241.8 ave 37659 max 29705 min -Histogram: 2 0 0 2 2 0 0 0 1 1 - -Total # of neighbors = 265934 -Ave neighs/atom = 106.374 -Ave special neighs/atom = 0 -Neighbor list builds = 21 -Dangerous builds = 0 - -#turn off recentering during production run -unfix recentering - - -#setup trajectory output -dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element -dump_modify myDump element M -dump_modify myDump sort id - -#run production (for realistic results, run for 10000000 steps) -reset_timestep 0 -thermo 1000 -thermo_style custom etotal ke pe temp evdwl -run 10000 -Per MPI rank memory allocation (min/avg/max) = 7.588 | 7.589 | 7.589 Mbytes -TotEng KinEng PotEng Temp E_vdwl - 1442.5428 2209.7431 -767.20027 296.64787 -767.20027 - 1391.8624 2262.6889 -870.82656 303.7556 -870.82656 - 1375.914 2244.6176 -868.7036 301.3296 -868.7036 - 1345.9064 2227.2324 -881.32599 298.99573 -881.32599 - 1379.2334 2278.1156 -898.88222 305.82657 -898.88222 - 1389.7928 2255.8062 -866.01341 302.83163 -866.01341 - 1380.4549 2258.2108 -877.75582 303.15443 -877.75582 - 1380.8489 2256.9432 -876.09428 302.98426 -876.09428 - 1326.5151 2225.7408 -899.22577 298.79549 -899.22577 - 1376.6025 2253.0128 -876.41028 302.45662 -876.41028 - 1331.0008 2218.1033 -887.10258 297.77019 -887.10258 -Loop time of 25.4591 on 8 procs for 10000 steps with 2500 atoms - -Performance: 33.937 ns/day, 0.707 hours/ns, 392.787 timesteps/s -89.3% CPU use with 8 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 21.635 | 21.916 | 22.237 | 3.9 | 86.08 -Bond | 0.0011308 | 0.0013149 | 0.0016932 | 0.5 | 0.01 -Neigh | 0.14593 | 0.15675 | 0.16667 | 1.9 | 0.62 -Comm | 1.3789 | 1.7502 | 1.9558 | 13.7 | 6.87 -Output | 0.34664 | 0.82927 | 1.2013 | 32.8 | 3.26 -Modify | 0.24904 | 0.25842 | 0.26907 | 1.2 | 1.02 -Other | | 0.5475 | | | 2.15 - -Nlocal: 312.5 ave 
327 max 298 min -Histogram: 2 0 0 1 1 0 1 1 1 1 -Nghost: 2575 ave 2601 max 2559 min -Histogram: 2 0 3 1 0 0 0 0 1 1 -Neighs: 33223.2 ave 35920 max 30303 min -Histogram: 1 1 1 1 0 1 0 0 0 3 - -Total # of neighbors = 265786 -Ave neighs/atom = 106.314 -Ave special neighs/atom = 0 -Neighbor list builds = 103 -Dangerous builds = 0 - - -Total wall time: 0:00:32 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1 b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1 new file mode 100644 index 0000000000..3048264818 --- /dev/null +++ b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1 @@ -0,0 +1,259 @@ +LAMMPS (27 Oct 2021) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +# LAMMPS input file for 50.0% methanol mole fraction solution +# with 2500 methanol molecules in implicit water. +# +# +# Author: David Rosenberger, van der Vegt Group, TU Darmstadt +# +# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019) + + +# Initialize simulation box +dimension 3 +boundary p p p +units real +atom_style molecular + +# Set potential styles +pair_style hybrid/overlay table spline 500 local/density + +# Read molecule data and set initial velocities +read_data methanol_implicit_water.data +Reading data file ... + orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 2500 atoms +Finding 1-2 1-3 1-4 neighbors ... 
+ special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 0 = max # of 1-2 neighbors + 0 = max # of 1-3 neighbors + 0 = max # of 1-4 neighbors + 1 = max # of special neighbors + special bonds CPU = 0.001 seconds + read_data CPU = 0.016 seconds +velocity all create 3.0000e+02 12142 rot yes dist gaussian + +# Assign potentials +pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM +WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 254 of 500 distance values in table 1e-06 with relative error +WARNING: over PairMM to re-computed values (src/pair_table.cpp:473) +pair_coeff * * local/density methanol_implicit_water.localdensity.table + + + + +#Recentering during minimization and equilibration +fix recentering all recenter 0.0 0.0 0.0 units box + +#Thermostat & time integration +timestep 1.0 +thermo 100 +thermo_style custom etotal ke pe temp evdwl + +#minimization +minimize 1.e-4 0.0 1000 1000 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- pair_style local/density command: + +@Article{Sanyal16, + author = {T.Sanyal and M.Scott Shell}, + title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation}, + journal = {J.~Chem.~Phys.}, + year = 2016, + DOI = doi.org/10.1063/1.4958629} + +@Article{Sanyal18, + author = {T.Sanyal and M.Scott Shell}, + title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy}, + journal = {J.~Phys.~Chem. 
B}, + year = 2018, + DOI = doi.org/10.1021/acs.jpcb.7b12446} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187) + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 8 8 8 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair table, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d + bin: standard + (2) pair local/density, perpetual, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 9.535 | 9.535 | 9.535 Mbytes +TotEng KinEng PotEng Temp E_vdwl + 1283.8556 2234.7133 -950.85771 300 -950.85771 + -10.187232 2234.7133 -2244.9005 300 -2244.9005 + -124.79406 2234.7133 -2359.5074 300 -2359.5074 + -126.7619 2234.7133 -2361.4752 300 -2361.4752 +Loop time of 3.74581 on 1 procs for 205 steps with 2500 atoms + +99.5% CPU use with 1 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + -950.857712502514 -2361.24417962983 -2361.47519428972 + Force two-norm initial, final = 135.25170 2.8038329 + Force max component initial, final = 14.083102 1.1154133 + Final line search alpha, max atom move = 0.16981022 0.18940857 + Iterations, force evaluations = 205 223 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 3.5678 | 3.5678 | 3.5678 | 0.0 | 95.25 +Bond | 7.5831e-05 | 7.5831e-05 | 7.5831e-05 | 0.0 | 0.00 +Neigh | 0.12962 | 0.12962 | 0.12962 | 0.0 | 3.46 +Comm | 0.019204 | 0.019204 | 0.019204 | 0.0 | 0.51 +Output | 
0.00023948 | 0.00023948 | 0.00023948 | 0.0 | 0.01 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.02886 | | | 0.77 + +Nlocal: 2500.00 ave 2500 max 2500 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6729.00 ave 6729 max 6729 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 265637.0 ave 265637 max 265637 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 265637 +Ave neighs/atom = 106.25480 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 11 +Dangerous builds = 0 + +#set up integration parameters +fix timeintegration all nve +fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915 + +#Equilibration (for realistic results, run for 2000000 steps) +reset_timestep 0 +thermo 200 +thermo_style custom etotal ke pe temp evdwl + +#run equilibration +run 2000 + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133) +Per MPI rank memory allocation (min/avg/max) = 8.410 | 8.410 | 8.410 Mbytes +TotEng KinEng PotEng Temp E_vdwl + -126.7619 2234.7133 -2361.4752 300 -2361.4752 + 517.05047 2015.8636 -1498.8131 270.62043 -1498.8131 + 931.78263 2135.4332 -1203.6506 286.6721 -1203.6506 + 1162.6209 2242.1662 -1079.5453 301.00051 -1079.5453 + 1164.2129 2211.6204 -1047.4075 296.89989 -1047.4075 + 1258.0085 2286.5942 -1028.5857 306.96477 -1028.5857 + 1231.1937 2200.814 -969.62032 295.44917 -969.62032 + 1251.2144 2245.0533 -993.83885 301.3881 -993.83885 + 1237.2495 2239.8802 -1002.6307 300.69363 -1002.6307 + 1232.3342 2224.3415 -992.00722 298.60763 -992.00722 + 1235.3228 2197.191 -961.86817 294.9628 -961.86817 +Loop time of 23.6478 on 1 procs for 2000 steps with 2500 atoms + +Performance: 7.307 ns/day, 3.284 hours/ns, 84.575 timesteps/s +99.5% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 22.797 | 
22.797 | 22.797 | 0.0 | 96.40 +Bond | 0.00070412 | 0.00070412 | 0.00070412 | 0.0 | 0.00 +Neigh | 0.2249 | 0.2249 | 0.2249 | 0.0 | 0.95 +Comm | 0.12259 | 0.12259 | 0.12259 | 0.0 | 0.52 +Output | 0.00088925 | 0.00088925 | 0.00088925 | 0.0 | 0.00 +Modify | 0.46447 | 0.46447 | 0.46447 | 0.0 | 1.96 +Other | | 0.03711 | | | 0.16 + +Nlocal: 2500.00 ave 2500 max 2500 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6752.00 ave 6752 max 6752 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 265940.0 ave 265940 max 265940 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 265940 +Ave neighs/atom = 106.37600 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 20 +Dangerous builds = 0 + +#turn off recentering during production run +unfix recentering + + +#setup trajectory output +dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element +dump_modify myDump element M +dump_modify myDump sort id + +#run production (for realistic results, run for 10000000 steps) +reset_timestep 0 +thermo 1000 +thermo_style custom etotal ke pe temp evdwl +run 10000 + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 9.918 | 9.918 | 9.918 Mbytes +TotEng KinEng PotEng Temp E_vdwl + 1235.3228 2197.191 -961.86817 294.9628 -961.86817 + 1289.8463 2236.1425 -946.29622 300.19186 -946.29622 + 1348.0825 2305.0295 -956.94703 309.43963 -956.94703 + 1279.5478 2241.1582 -961.61041 300.86521 -961.61041 + 1231.8597 2201.9591 -970.09949 295.60291 -970.09949 + 1277.3424 2221.3696 -944.02725 298.20867 -944.02725 + 1296.0116 2222.0998 -926.08818 298.3067 -926.08818 + 1266.2849 2206.3727 -940.08782 296.1954 -940.08782 + 1313.2808 2260.5077 -947.22683 303.46278 -947.22683 + 1309.3076 2234.3895 -925.08198 299.95654 -925.08198 + 1275.9792 2221.3037 -945.32449 298.19982 -945.32449 +Loop time of 67.3224 on 1 procs for 10000 steps with 2500 atoms + +Performance: 12.834 ns/day, 1.870 hours/ns, 148.539 timesteps/s +99.4% 
CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 64.476 | 64.476 | 64.476 | 0.0 | 95.77 +Bond | 0.0014504 | 0.0014504 | 0.0014504 | 0.0 | 0.00 +Neigh | 0.71333 | 0.71333 | 0.71333 | 0.0 | 1.06 +Comm | 0.32846 | 0.32846 | 0.32846 | 0.0 | 0.49 +Output | 0.46997 | 0.46997 | 0.46997 | 0.0 | 0.70 +Modify | 1.2336 | 1.2336 | 1.2336 | 0.0 | 1.83 +Other | | 0.09996 | | | 0.15 + +Nlocal: 2500.00 ave 2500 max 2500 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6662.00 ave 6662 max 6662 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 265774.0 ave 265774 max 265774 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 265774 +Ave neighs/atom = 106.30960 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 104 +Dangerous builds = 0 + + +Total wall time: 0:01:34 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4 b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4 new file mode 100644 index 0000000000..9467e7f9bf --- /dev/null +++ b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4 @@ -0,0 +1,259 @@ +LAMMPS (27 Oct 2021) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +# LAMMPS input file for 50.0% methanol mole fraction solution +# with 2500 methanol molecules in implicit water. +# +# +# Author: David Rosenberger, van der Vegt Group, TU Darmstadt +# +# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 
15, 2881-2895 (2019) + + +# Initialize simulation box +dimension 3 +boundary p p p +units real +atom_style molecular + +# Set potential styles +pair_style hybrid/overlay table spline 500 local/density + +# Read molecule data and set initial velocities +read_data methanol_implicit_water.data +Reading data file ... + orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000) + 1 by 2 by 2 MPI processor grid + reading atoms ... + 2500 atoms +Finding 1-2 1-3 1-4 neighbors ... + special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 0 = max # of 1-2 neighbors + 0 = max # of 1-3 neighbors + 0 = max # of 1-4 neighbors + 1 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.005 seconds +velocity all create 3.0000e+02 12142 rot yes dist gaussian + +# Assign potentials +pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM +WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr. +WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465) +WARNING: 254 of 500 distance values in table 1e-06 with relative error +WARNING: over PairMM to re-computed values (src/pair_table.cpp:473) +pair_coeff * * local/density methanol_implicit_water.localdensity.table + + + + +#Recentering during minimization and equilibration +fix recentering all recenter 0.0 0.0 0.0 units box + +#Thermostat & time integration +timestep 1.0 +thermo 100 +thermo_style custom etotal ke pe temp evdwl + +#minimization +minimize 1.e-4 0.0 1000 1000 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- pair_style local/density command: + +@Article{Sanyal16, + author = {T.Sanyal and M.Scott Shell}, + title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation}, + journal = {J.~Chem.~Phys.}, + year = 2016, + DOI = doi.org/10.1063/1.4958629} + 
+@Article{Sanyal18, + author = {T.Sanyal and M.Scott Shell}, + title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy}, + journal = {J.~Phys.~Chem. B}, + year = 2018, + DOI = doi.org/10.1021/acs.jpcb.7b12446} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187) + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 17 + ghost atom cutoff = 17 + binsize = 8.5, bins = 8 8 8 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair table, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d + bin: standard + (2) pair local/density, perpetual, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 7.855 | 7.855 | 7.855 Mbytes +TotEng KinEng PotEng Temp E_vdwl + 1283.8556 2234.7133 -950.85771 300 -950.85771 + -10.187232 2234.7133 -2244.9005 300 -2244.9005 + -124.3661 2234.7133 -2359.0794 300 -2359.0794 + -146.7158 2234.7133 -2381.4291 300 -2381.4291 +Loop time of 0.528503 on 4 procs for 244 steps with 2500 atoms + +99.7% CPU use with 4 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + -950.857712502527 -2381.2294195605 -2381.42909821383 + Force two-norm initial, final = 135.25170 2.3117934 + Force max component initial, final = 14.083102 0.60833889 + Final line search alpha, max atom move = 0.18347073 0.11161238 + Iterations, force evaluations = 244 278 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total 
+--------------------------------------------------------------- +Pair | 0.48518 | 0.48843 | 0.49223 | 0.4 | 92.42 +Bond | 1.0084e-05 | 1.0861e-05 | 1.1483e-05 | 0.0 | 0.00 +Neigh | 0.018199 | 0.019153 | 0.020036 | 0.5 | 3.62 +Comm | 0.010229 | 0.014832 | 0.018994 | 2.6 | 2.81 +Output | 3.7985e-05 | 4.2069e-05 | 5.3874e-05 | 0.0 | 0.01 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.006032 | | | 1.14 + +Nlocal: 625.000 ave 638 max 618 min +Histogram: 2 0 0 0 1 0 0 0 0 1 +Nghost: 3613.75 ave 3640 max 3580 min +Histogram: 1 0 0 0 1 0 0 0 1 1 +Neighs: 66411.2 ave 70713 max 62416 min +Histogram: 1 0 1 0 0 0 1 0 0 1 + +Total # of neighbors = 265645 +Ave neighs/atom = 106.25800 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 13 +Dangerous builds = 0 + +#set up integration parameters +fix timeintegration all nve +fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915 + +#Equilibration (for realistic results, run for 2000000 steps) +reset_timestep 0 +thermo 200 +thermo_style custom etotal ke pe temp evdwl + +#run equilibration +run 2000 + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133) +Per MPI rank memory allocation (min/avg/max) = 6.730 | 6.730 | 6.731 Mbytes +TotEng KinEng PotEng Temp E_vdwl + -146.7158 2234.7133 -2381.4291 300 -2381.4291 + 540.68168 2041.44 -1500.7584 274.05395 -1500.7584 + 945.4949 2163.7509 -1218.256 290.47363 -1218.256 + 1118.7729 2195.7579 -1076.985 294.77042 -1076.985 + 1215.0058 2233.2445 -1018.2387 299.80282 -1018.2387 + 1251.8045 2240.8439 -989.03944 300.823 -989.03944 + 1206.649 2149.5807 -942.93169 288.57134 -942.93169 + 1290.6111 2248.3623 -957.75117 301.83231 -957.75117 + 1312.8944 2219.147 -906.25264 297.9103 -906.25264 + 1260.002 2211.4176 -951.41561 296.87266 -951.41561 + 1335.0956 2270.1367 -935.04108 304.75543 -935.04108 +Loop time of 3.56721 on 4 procs for 2000 steps with 2500 atoms + +Performance: 
48.441 ns/day, 0.495 hours/ns, 560.663 timesteps/s +99.8% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 3.3122 | 3.3399 | 3.3633 | 1.0 | 93.63 +Bond | 7.5941e-05 | 8.062e-05 | 8.7627e-05 | 0.0 | 0.00 +Neigh | 0.03524 | 0.036666 | 0.037864 | 0.6 | 1.03 +Comm | 0.080116 | 0.10444 | 0.13373 | 6.1 | 2.93 +Output | 0.00019977 | 0.00022502 | 0.00029007 | 0.0 | 0.01 +Modify | 0.077781 | 0.078206 | 0.078752 | 0.1 | 2.19 +Other | | 0.007641 | | | 0.21 + +Nlocal: 625.000 ave 637 max 616 min +Histogram: 1 0 1 0 1 0 0 0 0 1 +Nghost: 3597.25 ave 3610 max 3586 min +Histogram: 1 0 1 0 0 0 1 0 0 1 +Neighs: 66468.2 ave 69230 max 62721 min +Histogram: 1 0 0 1 0 0 0 0 0 2 + +Total # of neighbors = 265873 +Ave neighs/atom = 106.34920 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 20 +Dangerous builds = 0 + +#turn off recentering during production run +unfix recentering + + +#setup trajectory output +dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element +dump_modify myDump element M +dump_modify myDump sort id + +#run production (for realistic results, run for 10000000 steps) +reset_timestep 0 +thermo 1000 +thermo_style custom etotal ke pe temp evdwl +run 10000 + generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 8.071 | 8.071 | 8.071 Mbytes +TotEng KinEng PotEng Temp E_vdwl + 1335.0956 2270.1367 -935.04108 304.75543 -935.04108 + 1266.2305 2227.2123 -960.98186 298.99303 -960.98186 + 1304.2289 2238.1343 -933.90544 300.45925 -933.90544 + 1311.3201 2232.0862 -920.7661 299.64733 -920.7661 + 1289.9028 2241.3533 -951.45049 300.89139 -951.45049 + 1314.2234 2244.8514 -930.62797 301.361 -930.62797 + 1282.2744 2240.6716 -958.39719 300.79987 -958.39719 + 1239.302 2181.5711 -942.2691 292.86591 -942.2691 + 1327.0954 2242.6441 
-915.54875 301.06468 -915.54875 + 1334.9799 2239.6841 -904.70423 300.66731 -904.70423 + 1320.6105 2263.4912 -942.88066 303.8633 -942.88066 +Loop time of 23.3399 on 4 procs for 10000 steps with 2500 atoms + +Performance: 37.018 ns/day, 0.648 hours/ns, 428.451 timesteps/s +99.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 21.343 | 21.606 | 21.766 | 3.7 | 92.57 +Bond | 0.00045963 | 0.0004817 | 0.0005083 | 0.0 | 0.00 +Neigh | 0.20708 | 0.22081 | 0.22733 | 1.7 | 0.95 +Comm | 0.63014 | 0.80326 | 1.0801 | 19.8 | 3.44 +Output | 0.11791 | 0.14443 | 0.22211 | 11.8 | 0.62 +Modify | 0.37291 | 0.389 | 0.41719 | 2.7 | 1.67 +Other | | 0.1761 | | | 0.75 + +Nlocal: 625.000 ave 636 max 613 min +Histogram: 1 0 0 0 0 2 0 0 0 1 +Nghost: 3597.00 ave 3613 max 3580 min +Histogram: 1 0 0 1 0 0 0 1 0 1 +Neighs: 66408.5 ave 69186 max 61728 min +Histogram: 1 0 0 0 0 0 1 0 1 1 + +Total # of neighbors = 265634 +Ave neighs/atom = 106.25360 +Ave special neighs/atom = 0.0000000 +Neighbor list builds = 102 +Dangerous builds = 0 + + +Total wall time: 0:00:27 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table index b9b4a082bc..af2d4304f7 100644 --- a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table +++ b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table @@ -1,4 +1,4 @@ -#LOCAL DENSITY POTENTIALS +#LOCAL DENSITY POTENTIALS UNITS: real 1 500 diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table index b74fe398e8..6ec4a0a762 100644 --- 
a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table +++ b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table @@ -1,4 +1,4 @@ - +# UNITS: real PairMM N 500 R 2.00000e-02 1.50000e+01 diff --git a/examples/plugins/CMakeLists.txt b/examples/plugins/CMakeLists.txt index 0ca2c025e2..8bef055ad3 100644 --- a/examples/plugins/CMakeLists.txt +++ b/examples/plugins/CMakeLists.txt @@ -14,26 +14,29 @@ endif() project(plugins VERSION 1.0 LANGUAGES CXX) -# ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro -# and prints lots of pointless warnings about "unsafe" functions -if(MSVC) - add_compile_options(/Zc:__cplusplus) - add_compile_options(/wd4244) - add_compile_options(/wd4267) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) -endif() - -# NOTE: the next line should be commented out when used outside of the LAMMPS package -get_filename_component(LAMMPS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../src ABSOLUTE) -set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR} CACHE PATH "Location of LAMMPS headers") -if(NOT LAMMPS_HEADER_DIR) - message(FATAL_ERROR "Must set LAMMPS_HEADER_DIR") -endif() - -# by default, install into $HOME/.local (not /usr/local), -# so that no root access (and sudo) is needed -if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.local" CACHE PATH "Default install path" FORCE) +# when this file is included as subdirectory in the LAMMPS build, many settings are directly imported +if(LAMMPS_DIR) + set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR}) +else() + # NOTE: the next line should be commented out when used outside of the LAMMPS package + get_filename_component(LAMMPS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../src ABSOLUTE) + set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR} CACHE PATH "Location of LAMMPS headers") + if(NOT LAMMPS_HEADER_DIR) + message(FATAL_ERROR "Must set LAMMPS_HEADER_DIR") + endif() + # by default, install
into $HOME/.local (not /usr/local), + # so that no root access (and sudo) is needed + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.local" CACHE PATH "Default install path" FORCE) + endif() + # ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro + # and prints lots of pointless warnings about "unsafe" functions + if(MSVC) + add_compile_options(/Zc:__cplusplus) + add_compile_options(/wd4244) + add_compile_options(/wd4267) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + endif() endif() # C++11 is required @@ -45,9 +48,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") endif() -set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) include(CheckIncludeFileCXX) -include(LAMMPSInterfaceCXX) +if(NOT LAMMPS_DIR) + include(LAMMPSInterfaceCXX) +endif() ########################## # building the plugins @@ -66,7 +71,7 @@ add_library(zero2plugin MODULE zero2plugin.cpp pair_zero2.cpp bond_zero2.cpp angle_zero2.cpp dihedral_zero2.cpp improper_zero2.cpp) target_link_libraries(zero2plugin PRIVATE lammps) -set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES PREFIX "") +set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES PREFIX "" SUFFIX ".so") # MacOS seems to need this if(CMAKE_SYSTEM_NAME STREQUAL Darwin) @@ -84,3 +89,7 @@ else() set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES LINK_FLAGS "-rdynamic") endif() + +# aggregate target: building "plugins" builds each of the four example plugin modules +add_custom_target(plugins ALL ${CMAKE_COMMAND} -E echo "Building Plugins" + DEPENDS morse2plugin nve2plugin helloplugin zero2plugin VERBATIM) diff --git a/examples/threebody/in.mos2.sw.mod b/examples/threebody/in.mos2.sw.mod new file mode 100644 index 0000000000..d5ddd356e0 --- /dev/null +++ b/examples/threebody/in.mos2.sw.mod @@ -0,0 +1,30 @@ +# monolayer MoS2 +units metal +boundary p p f +processors
* * 1 + +atom_style atomic +read_data single_layer_MoS2.data + +mass * 32.065 # mass of sulphur atom , unit: a.u.=1.66X10^(-27)kg +mass 1 95.94 # mass of molybdenum atom , unit: a.u.=1.66X10^(-27)kg + +########################## Define potentials ################################ +pair_style sw/mod maxdelcs 0.25 0.35 +pair_coeff * * tmd.sw.mod Mo S S +######################################################################### + +### Simulation settings #### +timestep 0.001 +velocity all create 300.0 12345 + +############################ + +# Output +thermo 500 +thermo_style custom step etotal pe ke temp +thermo_modify lost warn + +###### Run molecular dynamics ###### +fix thermostat all nve +run 5000 diff --git a/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1 b/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1 new file mode 100644 index 0000000000..4dda8e9d1c --- /dev/null +++ b/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1 @@ -0,0 +1,92 @@ +LAMMPS (27 Oct 2021) +# monolayer MoS2 +units metal +boundary p p f +processors * * 1 + +atom_style atomic +read_data single_layer_MoS2.data +Reading data file ... + triclinic box = (0.0000000 0.0000000 -100.00000) to (51.152320 44.299209 100.00000) with tilt (25.576160 0.0000000 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ...
+ 768 atoms + read_data CPU = 0.043 seconds + +mass * 32.065 # mass of sulphur atom , uint: a.u.=1.66X10^(-27)kg +mass 1 95.94 # mass of molebdenum atom , uint: a.u.=1.66X10^(-27)kg + +########################## Define potentials ################################ +pair_style sw/mod maxdelcs 0.25 0.35 +pair_coeff * * tmd.sw.mod Mo S S +Reading sw potential file tmd.sw.mod with DATE: 2018-03-26 +######################################################################### + +### Simulation settings #### +timestep 0.001 +velocity all create 300.0 12345 + +############################ + +# Output +thermo 500 +thermo_style custom step etotal pe ke temp +thermo_modify lost warn + +###### Run molecular dynamics ###### +fix thermostat all nve +run 5000 +Neighbor list info ... + update every 1 steps, delay 10 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 5.158796 + ghost atom cutoff = 5.158796 + binsize = 2.579398, bins = 30 18 78 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair sw/mod, perpetual + attributes: full, newton on + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.466 | 3.466 | 3.466 Mbytes +Step TotEng PotEng KinEng Temp + 0 -899.28605 -929.02881 29.742759 300 + 500 -899.28626 -922.45519 23.168929 233.69313 + 1000 -899.29247 -925.86547 26.573002 268.02828 + 1500 -899.27957 -916.95478 17.675214 178.28084 + 2000 -899.28171 -918.38728 19.105573 192.70814 + 2500 -899.28732 -922.50423 23.21691 234.17709 + 3000 -899.28195 -918.74112 19.459174 196.27473 + 3500 -899.27944 -918.03105 18.751604 189.13784 + 4000 -899.28397 -920.50737 21.223397 214.06955 + 4500 -899.28386 -919.79154 20.507685 206.85053 + 5000 -899.28077 -918.78947 19.508698 196.77425 +Loop time of 5.84317 on 1 procs for 5000 steps with 768 atoms + +Performance: 73.932 ns/day, 0.325 hours/ns, 855.700 timesteps/s +99.8% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task 
timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 5.6796 | 5.6796 | 5.6796 | 0.0 | 97.20 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.026354 | 0.026354 | 0.026354 | 0.0 | 0.45 +Output | 0.0014959 | 0.0014959 | 0.0014959 | 0.0 | 0.03 +Modify | 0.090437 | 0.090437 | 0.090437 | 0.0 | 1.55 +Other | | 0.04524 | | | 0.77 + +Nlocal: 768.000 ave 768 max 768 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 354.000 ave 354 max 354 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 20480.0 ave 20480 max 20480 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 20480 +Ave neighs/atom = 26.666667 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:06 diff --git a/examples/threebody/single_layer_MoS2.data b/examples/threebody/single_layer_MoS2.data new file mode 100644 index 0000000000..e68230987c --- /dev/null +++ b/examples/threebody/single_layer_MoS2.data @@ -0,0 +1,781 @@ + Single layer MoS2 + + 768 atoms + + 3 atom types + + 0.0000000000000000 51.1523200000000177 xlo xhi + 0.0000000000000000 44.2992085825108320 ylo yhi + -100.0000000000000000 100.0000000000000000 zlo zhi + 25.5761600000000088 0.0000000000000000 0.0000000000000000 xy xz yz + + Atoms + + 1 2 0.000000000000000 0.000000000000000 -1.596930000000000 + 2 3 0.000000000000000 0.000000000000000 1.596930000000000 + 3 1 0.000000000000000 1.845800357604618 0.000000000000000 + 4 2 1.598510000000001 2.768700536406927 -1.596930000000000 + 5 3 1.598510000000001 2.768700536406927 1.596930000000000 + 6 1 1.598510000000001 4.614500894011545 0.000000000000000 + 7 2 3.197020000000001 5.537401072813854 -1.596930000000000 + 8 3 3.197020000000001 5.537401072813854 1.596930000000000 + 9 1 3.197020000000001 7.383201430418472 0.000000000000000 + 10 2 4.795530000000002 8.306101609220781 -1.596930000000000 + 11 3 4.795530000000002 8.306101609220781 
1.596930000000000 + 12 1 4.795530000000002 10.151901966825399 0.000000000000000 + 13 2 6.394040000000002 11.074802145627708 -1.596930000000000 + 14 3 6.394040000000002 11.074802145627708 1.596930000000000 + 15 1 6.394040000000002 12.920602503232326 0.000000000000000 + 16 2 7.992550000000003 13.843502682034635 -1.596930000000000 + 17 3 7.992550000000003 13.843502682034635 1.596930000000000 + 18 1 7.992550000000003 15.689303039639253 0.000000000000000 + 19 2 9.591060000000003 16.612203218441562 -1.596930000000000 + 20 3 9.591060000000003 16.612203218441562 1.596930000000000 + 21 1 9.591060000000003 18.458003576046180 0.000000000000000 + 22 2 11.189570000000004 19.380903754848489 -1.596930000000000 + 23 3 11.189570000000004 19.380903754848489 1.596930000000000 + 24 1 11.189570000000004 21.226704112453107 0.000000000000000 + 25 2 12.788080000000004 22.149604291255416 -1.596930000000000 + 26 3 12.788080000000004 22.149604291255416 1.596930000000000 + 27 1 12.788080000000004 23.995404648860034 0.000000000000000 + 28 2 14.386590000000005 24.918304827662343 -1.596930000000000 + 29 3 14.386590000000005 24.918304827662343 1.596930000000000 + 30 1 14.386590000000005 26.764105185266961 0.000000000000000 + 31 2 15.985100000000006 27.687005364069270 -1.596930000000000 + 32 3 15.985100000000006 27.687005364069270 1.596930000000000 + 33 1 15.985100000000006 29.532805721673888 0.000000000000000 + 34 2 17.583610000000006 30.455705900476197 -1.596930000000000 + 35 3 17.583610000000006 30.455705900476197 1.596930000000000 + 36 1 17.583610000000006 32.301506258080815 0.000000000000000 + 37 2 19.182120000000007 33.224406436883124 -1.596930000000000 + 38 3 19.182120000000007 33.224406436883124 1.596930000000000 + 39 1 19.182120000000007 35.070206794487742 0.000000000000000 + 40 2 20.780630000000007 35.993106973290051 -1.596930000000000 + 41 3 20.780630000000007 35.993106973290051 1.596930000000000 + 42 1 20.780630000000007 37.838907330894669 0.000000000000000 + 43 2 22.379140000000008 
38.761807509696978 -1.596930000000000 + 44 3 22.379140000000008 38.761807509696978 1.596930000000000 + 45 1 22.379140000000008 40.607607867301596 0.000000000000000 + 46 2 23.977650000000008 41.530508046103905 -1.596930000000000 + 47 3 23.977650000000008 41.530508046103905 1.596930000000000 + 48 1 23.977650000000008 43.376308403708523 0.000000000000000 + 49 2 3.197020000000001 0.000000000000000 -1.596930000000000 + 50 3 3.197020000000001 0.000000000000000 1.596930000000000 + 51 1 3.197020000000001 1.845800357604618 0.000000000000000 + 52 2 4.795530000000002 2.768700536406927 -1.596930000000000 + 53 3 4.795530000000002 2.768700536406927 1.596930000000000 + 54 1 4.795530000000002 4.614500894011545 0.000000000000000 + 55 2 6.394040000000002 5.537401072813854 -1.596930000000000 + 56 3 6.394040000000002 5.537401072813854 1.596930000000000 + 57 1 6.394040000000002 7.383201430418472 0.000000000000000 + 58 2 7.992550000000003 8.306101609220781 -1.596930000000000 + 59 3 7.992550000000003 8.306101609220781 1.596930000000000 + 60 1 7.992550000000003 10.151901966825399 0.000000000000000 + 61 2 9.591060000000003 11.074802145627708 -1.596930000000000 + 62 3 9.591060000000003 11.074802145627708 1.596930000000000 + 63 1 9.591060000000003 12.920602503232326 0.000000000000000 + 64 2 11.189570000000004 13.843502682034635 -1.596930000000000 + 65 3 11.189570000000004 13.843502682034635 1.596930000000000 + 66 1 11.189570000000004 15.689303039639253 0.000000000000000 + 67 2 12.788080000000004 16.612203218441562 -1.596930000000000 + 68 3 12.788080000000004 16.612203218441562 1.596930000000000 + 69 1 12.788080000000004 18.458003576046180 0.000000000000000 + 70 2 14.386590000000005 19.380903754848489 -1.596930000000000 + 71 3 14.386590000000005 19.380903754848489 1.596930000000000 + 72 1 14.386590000000005 21.226704112453107 0.000000000000000 + 73 2 15.985100000000006 22.149604291255416 -1.596930000000000 + 74 3 15.985100000000006 22.149604291255416 1.596930000000000 + 75 1 
15.985100000000006 23.995404648860034 0.000000000000000 + 76 2 17.583610000000006 24.918304827662343 -1.596930000000000 + 77 3 17.583610000000006 24.918304827662343 1.596930000000000 + 78 1 17.583610000000006 26.764105185266961 0.000000000000000 + 79 2 19.182120000000007 27.687005364069270 -1.596930000000000 + 80 3 19.182120000000007 27.687005364069270 1.596930000000000 + 81 1 19.182120000000007 29.532805721673888 0.000000000000000 + 82 2 20.780630000000007 30.455705900476197 -1.596930000000000 + 83 3 20.780630000000007 30.455705900476197 1.596930000000000 + 84 1 20.780630000000007 32.301506258080815 0.000000000000000 + 85 2 22.379140000000008 33.224406436883124 -1.596930000000000 + 86 3 22.379140000000008 33.224406436883124 1.596930000000000 + 87 1 22.379140000000008 35.070206794487742 0.000000000000000 + 88 2 23.977650000000008 35.993106973290051 -1.596930000000000 + 89 3 23.977650000000008 35.993106973290051 1.596930000000000 + 90 1 23.977650000000008 37.838907330894669 0.000000000000000 + 91 2 25.576160000000009 38.761807509696978 -1.596930000000000 + 92 3 25.576160000000009 38.761807509696978 1.596930000000000 + 93 1 25.576160000000009 40.607607867301596 0.000000000000000 + 94 2 27.174670000000009 41.530508046103905 -1.596930000000000 + 95 3 27.174670000000009 41.530508046103905 1.596930000000000 + 96 1 27.174670000000009 43.376308403708523 0.000000000000000 + 97 2 6.394040000000002 0.000000000000000 -1.596930000000000 + 98 3 6.394040000000002 0.000000000000000 1.596930000000000 + 99 1 6.394040000000002 1.845800357604618 0.000000000000000 + 100 2 7.992550000000003 2.768700536406927 -1.596930000000000 + 101 3 7.992550000000003 2.768700536406927 1.596930000000000 + 102 1 7.992550000000003 4.614500894011545 0.000000000000000 + 103 2 9.591060000000003 5.537401072813854 -1.596930000000000 + 104 3 9.591060000000003 5.537401072813854 1.596930000000000 + 105 1 9.591060000000003 7.383201430418472 0.000000000000000 + 106 2 11.189570000000004 8.306101609220781 
-1.596930000000000 + 107 3 11.189570000000004 8.306101609220781 1.596930000000000 + 108 1 11.189570000000004 10.151901966825399 0.000000000000000 + 109 2 12.788080000000004 11.074802145627708 -1.596930000000000 + 110 3 12.788080000000004 11.074802145627708 1.596930000000000 + 111 1 12.788080000000004 12.920602503232326 0.000000000000000 + 112 2 14.386590000000005 13.843502682034635 -1.596930000000000 + 113 3 14.386590000000005 13.843502682034635 1.596930000000000 + 114 1 14.386590000000005 15.689303039639253 0.000000000000000 + 115 2 15.985100000000006 16.612203218441562 -1.596930000000000 + 116 3 15.985100000000006 16.612203218441562 1.596930000000000 + 117 1 15.985100000000006 18.458003576046180 0.000000000000000 + 118 2 17.583610000000006 19.380903754848489 -1.596930000000000 + 119 3 17.583610000000006 19.380903754848489 1.596930000000000 + 120 1 17.583610000000006 21.226704112453107 0.000000000000000 + 121 2 19.182120000000007 22.149604291255416 -1.596930000000000 + 122 3 19.182120000000007 22.149604291255416 1.596930000000000 + 123 1 19.182120000000007 23.995404648860034 0.000000000000000 + 124 2 20.780630000000007 24.918304827662343 -1.596930000000000 + 125 3 20.780630000000007 24.918304827662343 1.596930000000000 + 126 1 20.780630000000007 26.764105185266961 0.000000000000000 + 127 2 22.379140000000008 27.687005364069270 -1.596930000000000 + 128 3 22.379140000000008 27.687005364069270 1.596930000000000 + 129 1 22.379140000000008 29.532805721673888 0.000000000000000 + 130 2 23.977650000000008 30.455705900476197 -1.596930000000000 + 131 3 23.977650000000008 30.455705900476197 1.596930000000000 + 132 1 23.977650000000008 32.301506258080815 0.000000000000000 + 133 2 25.576160000000009 33.224406436883124 -1.596930000000000 + 134 3 25.576160000000009 33.224406436883124 1.596930000000000 + 135 1 25.576160000000009 35.070206794487742 0.000000000000000 + 136 2 27.174670000000009 35.993106973290051 -1.596930000000000 + 137 3 27.174670000000009 35.993106973290051 
1.596930000000000 + 138 1 27.174670000000009 37.838907330894669 0.000000000000000 + 139 2 28.773180000000010 38.761807509696978 -1.596930000000000 + 140 3 28.773180000000010 38.761807509696978 1.596930000000000 + 141 1 28.773180000000010 40.607607867301596 0.000000000000000 + 142 2 30.371690000000011 41.530508046103905 -1.596930000000000 + 143 3 30.371690000000011 41.530508046103905 1.596930000000000 + 144 1 30.371690000000011 43.376308403708523 0.000000000000000 + 145 2 9.591060000000003 0.000000000000000 -1.596930000000000 + 146 3 9.591060000000003 0.000000000000000 1.596930000000000 + 147 1 9.591060000000003 1.845800357604618 0.000000000000000 + 148 2 11.189570000000004 2.768700536406927 -1.596930000000000 + 149 3 11.189570000000004 2.768700536406927 1.596930000000000 + 150 1 11.189570000000004 4.614500894011545 0.000000000000000 + 151 2 12.788080000000004 5.537401072813854 -1.596930000000000 + 152 3 12.788080000000004 5.537401072813854 1.596930000000000 + 153 1 12.788080000000004 7.383201430418472 0.000000000000000 + 154 2 14.386590000000005 8.306101609220781 -1.596930000000000 + 155 3 14.386590000000005 8.306101609220781 1.596930000000000 + 156 1 14.386590000000005 10.151901966825399 0.000000000000000 + 157 2 15.985100000000006 11.074802145627708 -1.596930000000000 + 158 3 15.985100000000006 11.074802145627708 1.596930000000000 + 159 1 15.985100000000006 12.920602503232326 0.000000000000000 + 160 2 17.583610000000006 13.843502682034635 -1.596930000000000 + 161 3 17.583610000000006 13.843502682034635 1.596930000000000 + 162 1 17.583610000000006 15.689303039639253 0.000000000000000 + 163 2 19.182120000000007 16.612203218441562 -1.596930000000000 + 164 3 19.182120000000007 16.612203218441562 1.596930000000000 + 165 1 19.182120000000007 18.458003576046180 0.000000000000000 + 166 2 20.780630000000007 19.380903754848489 -1.596930000000000 + 167 3 20.780630000000007 19.380903754848489 1.596930000000000 + 168 1 20.780630000000007 21.226704112453107 0.000000000000000 + 
169 2 22.379140000000008 22.149604291255416 -1.596930000000000 + 170 3 22.379140000000008 22.149604291255416 1.596930000000000 + 171 1 22.379140000000008 23.995404648860034 0.000000000000000 + 172 2 23.977650000000008 24.918304827662343 -1.596930000000000 + 173 3 23.977650000000008 24.918304827662343 1.596930000000000 + 174 1 23.977650000000008 26.764105185266961 0.000000000000000 + 175 2 25.576160000000009 27.687005364069270 -1.596930000000000 + 176 3 25.576160000000009 27.687005364069270 1.596930000000000 + 177 1 25.576160000000009 29.532805721673888 0.000000000000000 + 178 2 27.174670000000009 30.455705900476197 -1.596930000000000 + 179 3 27.174670000000009 30.455705900476197 1.596930000000000 + 180 1 27.174670000000009 32.301506258080815 0.000000000000000 + 181 2 28.773180000000010 33.224406436883124 -1.596930000000000 + 182 3 28.773180000000010 33.224406436883124 1.596930000000000 + 183 1 28.773180000000010 35.070206794487742 0.000000000000000 + 184 2 30.371690000000011 35.993106973290051 -1.596930000000000 + 185 3 30.371690000000011 35.993106973290051 1.596930000000000 + 186 1 30.371690000000011 37.838907330894669 0.000000000000000 + 187 2 31.970200000000011 38.761807509696978 -1.596930000000000 + 188 3 31.970200000000011 38.761807509696978 1.596930000000000 + 189 1 31.970200000000011 40.607607867301596 0.000000000000000 + 190 2 33.568710000000012 41.530508046103905 -1.596930000000000 + 191 3 33.568710000000012 41.530508046103905 1.596930000000000 + 192 1 33.568710000000012 43.376308403708523 0.000000000000000 + 193 2 12.788080000000004 0.000000000000000 -1.596930000000000 + 194 3 12.788080000000004 0.000000000000000 1.596930000000000 + 195 1 12.788080000000004 1.845800357604618 0.000000000000000 + 196 2 14.386590000000005 2.768700536406927 -1.596930000000000 + 197 3 14.386590000000005 2.768700536406927 1.596930000000000 + 198 1 14.386590000000005 4.614500894011545 0.000000000000000 + 199 2 15.985100000000006 5.537401072813854 -1.596930000000000 + 200 3 
15.985100000000006 5.537401072813854 1.596930000000000 + 201 1 15.985100000000006 7.383201430418472 0.000000000000000 + 202 2 17.583610000000006 8.306101609220781 -1.596930000000000 + 203 3 17.583610000000006 8.306101609220781 1.596930000000000 + 204 1 17.583610000000006 10.151901966825399 0.000000000000000 + 205 2 19.182120000000007 11.074802145627708 -1.596930000000000 + 206 3 19.182120000000007 11.074802145627708 1.596930000000000 + 207 1 19.182120000000007 12.920602503232326 0.000000000000000 + 208 2 20.780630000000007 13.843502682034635 -1.596930000000000 + 209 3 20.780630000000007 13.843502682034635 1.596930000000000 + 210 1 20.780630000000007 15.689303039639253 0.000000000000000 + 211 2 22.379140000000008 16.612203218441562 -1.596930000000000 + 212 3 22.379140000000008 16.612203218441562 1.596930000000000 + 213 1 22.379140000000008 18.458003576046180 0.000000000000000 + 214 2 23.977650000000008 19.380903754848489 -1.596930000000000 + 215 3 23.977650000000008 19.380903754848489 1.596930000000000 + 216 1 23.977650000000008 21.226704112453107 0.000000000000000 + 217 2 25.576160000000009 22.149604291255416 -1.596930000000000 + 218 3 25.576160000000009 22.149604291255416 1.596930000000000 + 219 1 25.576160000000009 23.995404648860034 0.000000000000000 + 220 2 27.174670000000009 24.918304827662343 -1.596930000000000 + 221 3 27.174670000000009 24.918304827662343 1.596930000000000 + 222 1 27.174670000000009 26.764105185266961 0.000000000000000 + 223 2 28.773180000000010 27.687005364069270 -1.596930000000000 + 224 3 28.773180000000010 27.687005364069270 1.596930000000000 + 225 1 28.773180000000010 29.532805721673888 0.000000000000000 + 226 2 30.371690000000011 30.455705900476197 -1.596930000000000 + 227 3 30.371690000000011 30.455705900476197 1.596930000000000 + 228 1 30.371690000000011 32.301506258080815 0.000000000000000 + 229 2 31.970200000000011 33.224406436883124 -1.596930000000000 + 230 3 31.970200000000011 33.224406436883124 1.596930000000000 + 231 1 
31.970200000000011 35.070206794487742 0.000000000000000 + 232 2 33.568710000000012 35.993106973290051 -1.596930000000000 + 233 3 33.568710000000012 35.993106973290051 1.596930000000000 + 234 1 33.568710000000012 37.838907330894669 0.000000000000000 + 235 2 35.167220000000012 38.761807509696978 -1.596930000000000 + 236 3 35.167220000000012 38.761807509696978 1.596930000000000 + 237 1 35.167220000000012 40.607607867301596 0.000000000000000 + 238 2 36.765730000000013 41.530508046103905 -1.596930000000000 + 239 3 36.765730000000013 41.530508046103905 1.596930000000000 + 240 1 36.765730000000013 43.376308403708523 0.000000000000000 + 241 2 15.985100000000006 0.000000000000000 -1.596930000000000 + 242 3 15.985100000000006 0.000000000000000 1.596930000000000 + 243 1 15.985100000000006 1.845800357604618 0.000000000000000 + 244 2 17.583610000000006 2.768700536406927 -1.596930000000000 + 245 3 17.583610000000006 2.768700536406927 1.596930000000000 + 246 1 17.583610000000006 4.614500894011545 0.000000000000000 + 247 2 19.182120000000007 5.537401072813854 -1.596930000000000 + 248 3 19.182120000000007 5.537401072813854 1.596930000000000 + 249 1 19.182120000000007 7.383201430418472 0.000000000000000 + 250 2 20.780630000000007 8.306101609220781 -1.596930000000000 + 251 3 20.780630000000007 8.306101609220781 1.596930000000000 + 252 1 20.780630000000007 10.151901966825399 0.000000000000000 + 253 2 22.379140000000008 11.074802145627708 -1.596930000000000 + 254 3 22.379140000000008 11.074802145627708 1.596930000000000 + 255 1 22.379140000000008 12.920602503232326 0.000000000000000 + 256 2 23.977650000000008 13.843502682034635 -1.596930000000000 + 257 3 23.977650000000008 13.843502682034635 1.596930000000000 + 258 1 23.977650000000008 15.689303039639253 0.000000000000000 + 259 2 25.576160000000009 16.612203218441562 -1.596930000000000 + 260 3 25.576160000000009 16.612203218441562 1.596930000000000 + 261 1 25.576160000000009 18.458003576046180 0.000000000000000 + 262 2 
27.174670000000009 19.380903754848489 -1.596930000000000 + 263 3 27.174670000000009 19.380903754848489 1.596930000000000 + 264 1 27.174670000000009 21.226704112453107 0.000000000000000 + 265 2 28.773180000000010 22.149604291255416 -1.596930000000000 + 266 3 28.773180000000010 22.149604291255416 1.596930000000000 + 267 1 28.773180000000010 23.995404648860034 0.000000000000000 + 268 2 30.371690000000011 24.918304827662343 -1.596930000000000 + 269 3 30.371690000000011 24.918304827662343 1.596930000000000 + 270 1 30.371690000000011 26.764105185266961 0.000000000000000 + 271 2 31.970200000000011 27.687005364069270 -1.596930000000000 + 272 3 31.970200000000011 27.687005364069270 1.596930000000000 + 273 1 31.970200000000011 29.532805721673888 0.000000000000000 + 274 2 33.568710000000012 30.455705900476197 -1.596930000000000 + 275 3 33.568710000000012 30.455705900476197 1.596930000000000 + 276 1 33.568710000000012 32.301506258080815 0.000000000000000 + 277 2 35.167220000000012 33.224406436883124 -1.596930000000000 + 278 3 35.167220000000012 33.224406436883124 1.596930000000000 + 279 1 35.167220000000012 35.070206794487742 0.000000000000000 + 280 2 36.765730000000013 35.993106973290051 -1.596930000000000 + 281 3 36.765730000000013 35.993106973290051 1.596930000000000 + 282 1 36.765730000000013 37.838907330894669 0.000000000000000 + 283 2 38.364240000000013 38.761807509696978 -1.596930000000000 + 284 3 38.364240000000013 38.761807509696978 1.596930000000000 + 285 1 38.364240000000013 40.607607867301596 0.000000000000000 + 286 2 39.962750000000014 41.530508046103905 -1.596930000000000 + 287 3 39.962750000000014 41.530508046103905 1.596930000000000 + 288 1 39.962750000000014 43.376308403708523 0.000000000000000 + 289 2 19.182120000000007 0.000000000000000 -1.596930000000000 + 290 3 19.182120000000007 0.000000000000000 1.596930000000000 + 291 1 19.182120000000007 1.845800357604618 0.000000000000000 + 292 2 20.780630000000007 2.768700536406927 -1.596930000000000 + 293 3 
20.780630000000007 2.768700536406927 1.596930000000000 + 294 1 20.780630000000007 4.614500894011545 0.000000000000000 + 295 2 22.379140000000008 5.537401072813854 -1.596930000000000 + 296 3 22.379140000000008 5.537401072813854 1.596930000000000 + 297 1 22.379140000000008 7.383201430418472 0.000000000000000 + 298 2 23.977650000000008 8.306101609220781 -1.596930000000000 + 299 3 23.977650000000008 8.306101609220781 1.596930000000000 + 300 1 23.977650000000008 10.151901966825399 0.000000000000000 + 301 2 25.576160000000009 11.074802145627708 -1.596930000000000 + 302 3 25.576160000000009 11.074802145627708 1.596930000000000 + 303 1 25.576160000000009 12.920602503232326 0.000000000000000 + 304 2 27.174670000000009 13.843502682034635 -1.596930000000000 + 305 3 27.174670000000009 13.843502682034635 1.596930000000000 + 306 1 27.174670000000009 15.689303039639253 0.000000000000000 + 307 2 28.773180000000010 16.612203218441562 -1.596930000000000 + 308 3 28.773180000000010 16.612203218441562 1.596930000000000 + 309 1 28.773180000000010 18.458003576046180 0.000000000000000 + 310 2 30.371690000000011 19.380903754848489 -1.596930000000000 + 311 3 30.371690000000011 19.380903754848489 1.596930000000000 + 312 1 30.371690000000011 21.226704112453107 0.000000000000000 + 313 2 31.970200000000011 22.149604291255416 -1.596930000000000 + 314 3 31.970200000000011 22.149604291255416 1.596930000000000 + 315 1 31.970200000000011 23.995404648860034 0.000000000000000 + 316 2 33.568710000000012 24.918304827662343 -1.596930000000000 + 317 3 33.568710000000012 24.918304827662343 1.596930000000000 + 318 1 33.568710000000012 26.764105185266961 0.000000000000000 + 319 2 35.167220000000012 27.687005364069270 -1.596930000000000 + 320 3 35.167220000000012 27.687005364069270 1.596930000000000 + 321 1 35.167220000000012 29.532805721673888 0.000000000000000 + 322 2 36.765730000000013 30.455705900476197 -1.596930000000000 + 323 3 36.765730000000013 30.455705900476197 1.596930000000000 + 324 1 
36.765730000000013 32.301506258080815 0.000000000000000 + 325 2 38.364240000000013 33.224406436883124 -1.596930000000000 + 326 3 38.364240000000013 33.224406436883124 1.596930000000000 + 327 1 38.364240000000013 35.070206794487742 0.000000000000000 + 328 2 39.962750000000014 35.993106973290051 -1.596930000000000 + 329 3 39.962750000000014 35.993106973290051 1.596930000000000 + 330 1 39.962750000000014 37.838907330894669 0.000000000000000 + 331 2 41.561260000000014 38.761807509696978 -1.596930000000000 + 332 3 41.561260000000014 38.761807509696978 1.596930000000000 + 333 1 41.561260000000014 40.607607867301596 0.000000000000000 + 334 2 43.159770000000015 41.530508046103905 -1.596930000000000 + 335 3 43.159770000000015 41.530508046103905 1.596930000000000 + 336 1 43.159770000000015 43.376308403708523 0.000000000000000 + 337 2 22.379140000000008 0.000000000000000 -1.596930000000000 + 338 3 22.379140000000008 0.000000000000000 1.596930000000000 + 339 1 22.379140000000008 1.845800357604618 0.000000000000000 + 340 2 23.977650000000008 2.768700536406927 -1.596930000000000 + 341 3 23.977650000000008 2.768700536406927 1.596930000000000 + 342 1 23.977650000000008 4.614500894011545 0.000000000000000 + 343 2 25.576160000000009 5.537401072813854 -1.596930000000000 + 344 3 25.576160000000009 5.537401072813854 1.596930000000000 + 345 1 25.576160000000009 7.383201430418472 0.000000000000000 + 346 2 27.174670000000009 8.306101609220781 -1.596930000000000 + 347 3 27.174670000000009 8.306101609220781 1.596930000000000 + 348 1 27.174670000000009 10.151901966825399 0.000000000000000 + 349 2 28.773180000000010 11.074802145627708 -1.596930000000000 + 350 3 28.773180000000010 11.074802145627708 1.596930000000000 + 351 1 28.773180000000010 12.920602503232326 0.000000000000000 + 352 2 30.371690000000011 13.843502682034635 -1.596930000000000 + 353 3 30.371690000000011 13.843502682034635 1.596930000000000 + 354 1 30.371690000000011 15.689303039639253 0.000000000000000 + 355 2 
31.970200000000011 16.612203218441562 -1.596930000000000 + 356 3 31.970200000000011 16.612203218441562 1.596930000000000 + 357 1 31.970200000000011 18.458003576046180 0.000000000000000 + 358 2 33.568710000000012 19.380903754848489 -1.596930000000000 + 359 3 33.568710000000012 19.380903754848489 1.596930000000000 + 360 1 33.568710000000012 21.226704112453107 0.000000000000000 + 361 2 35.167220000000012 22.149604291255416 -1.596930000000000 + 362 3 35.167220000000012 22.149604291255416 1.596930000000000 + 363 1 35.167220000000012 23.995404648860034 0.000000000000000 + 364 2 36.765730000000013 24.918304827662343 -1.596930000000000 + 365 3 36.765730000000013 24.918304827662343 1.596930000000000 + 366 1 36.765730000000013 26.764105185266961 0.000000000000000 + 367 2 38.364240000000013 27.687005364069270 -1.596930000000000 + 368 3 38.364240000000013 27.687005364069270 1.596930000000000 + 369 1 38.364240000000013 29.532805721673888 0.000000000000000 + 370 2 39.962750000000014 30.455705900476197 -1.596930000000000 + 371 3 39.962750000000014 30.455705900476197 1.596930000000000 + 372 1 39.962750000000014 32.301506258080815 0.000000000000000 + 373 2 41.561260000000014 33.224406436883124 -1.596930000000000 + 374 3 41.561260000000014 33.224406436883124 1.596930000000000 + 375 1 41.561260000000014 35.070206794487742 0.000000000000000 + 376 2 43.159770000000015 35.993106973290051 -1.596930000000000 + 377 3 43.159770000000015 35.993106973290051 1.596930000000000 + 378 1 43.159770000000015 37.838907330894669 0.000000000000000 + 379 2 44.758280000000015 38.761807509696978 -1.596930000000000 + 380 3 44.758280000000015 38.761807509696978 1.596930000000000 + 381 1 44.758280000000015 40.607607867301596 0.000000000000000 + 382 2 46.356790000000016 41.530508046103905 -1.596930000000000 + 383 3 46.356790000000016 41.530508046103905 1.596930000000000 + 384 1 46.356790000000016 43.376308403708523 0.000000000000000 + 385 2 25.576160000000009 0.000000000000000 -1.596930000000000 + 386 3 
25.576160000000009 0.000000000000000 1.596930000000000 + 387 1 25.576160000000009 1.845800357604618 0.000000000000000 + 388 2 27.174670000000009 2.768700536406927 -1.596930000000000 + 389 3 27.174670000000009 2.768700536406927 1.596930000000000 + 390 1 27.174670000000009 4.614500894011545 0.000000000000000 + 391 2 28.773180000000010 5.537401072813854 -1.596930000000000 + 392 3 28.773180000000010 5.537401072813854 1.596930000000000 + 393 1 28.773180000000010 7.383201430418472 0.000000000000000 + 394 2 30.371690000000011 8.306101609220781 -1.596930000000000 + 395 3 30.371690000000011 8.306101609220781 1.596930000000000 + 396 1 30.371690000000011 10.151901966825399 0.000000000000000 + 397 2 31.970200000000011 11.074802145627708 -1.596930000000000 + 398 3 31.970200000000011 11.074802145627708 1.596930000000000 + 399 1 31.970200000000011 12.920602503232326 0.000000000000000 + 400 2 33.568710000000012 13.843502682034635 -1.596930000000000 + 401 3 33.568710000000012 13.843502682034635 1.596930000000000 + 402 1 33.568710000000012 15.689303039639253 0.000000000000000 + 403 2 35.167220000000012 16.612203218441562 -1.596930000000000 + 404 3 35.167220000000012 16.612203218441562 1.596930000000000 + 405 1 35.167220000000012 18.458003576046180 0.000000000000000 + 406 2 36.765730000000013 19.380903754848489 -1.596930000000000 + 407 3 36.765730000000013 19.380903754848489 1.596930000000000 + 408 1 36.765730000000013 21.226704112453107 0.000000000000000 + 409 2 38.364240000000013 22.149604291255416 -1.596930000000000 + 410 3 38.364240000000013 22.149604291255416 1.596930000000000 + 411 1 38.364240000000013 23.995404648860034 0.000000000000000 + 412 2 39.962750000000014 24.918304827662343 -1.596930000000000 + 413 3 39.962750000000014 24.918304827662343 1.596930000000000 + 414 1 39.962750000000014 26.764105185266961 0.000000000000000 + 415 2 41.561260000000014 27.687005364069270 -1.596930000000000 + 416 3 41.561260000000014 27.687005364069270 1.596930000000000 + 417 1 
41.561260000000014 29.532805721673888 0.000000000000000 + 418 2 43.159770000000015 30.455705900476197 -1.596930000000000 + 419 3 43.159770000000015 30.455705900476197 1.596930000000000 + 420 1 43.159770000000015 32.301506258080815 0.000000000000000 + 421 2 44.758280000000015 33.224406436883124 -1.596930000000000 + 422 3 44.758280000000015 33.224406436883124 1.596930000000000 + 423 1 44.758280000000015 35.070206794487742 0.000000000000000 + 424 2 46.356790000000016 35.993106973290051 -1.596930000000000 + 425 3 46.356790000000016 35.993106973290051 1.596930000000000 + 426 1 46.356790000000016 37.838907330894669 0.000000000000000 + 427 2 47.955300000000017 38.761807509696978 -1.596930000000000 + 428 3 47.955300000000017 38.761807509696978 1.596930000000000 + 429 1 47.955300000000017 40.607607867301596 0.000000000000000 + 430 2 49.553810000000017 41.530508046103905 -1.596930000000000 + 431 3 49.553810000000017 41.530508046103905 1.596930000000000 + 432 1 49.553810000000017 43.376308403708523 0.000000000000000 + 433 2 28.773180000000010 0.000000000000000 -1.596930000000000 + 434 3 28.773180000000010 0.000000000000000 1.596930000000000 + 435 1 28.773180000000010 1.845800357604618 0.000000000000000 + 436 2 30.371690000000011 2.768700536406927 -1.596930000000000 + 437 3 30.371690000000011 2.768700536406927 1.596930000000000 + 438 1 30.371690000000011 4.614500894011545 0.000000000000000 + 439 2 31.970200000000011 5.537401072813854 -1.596930000000000 + 440 3 31.970200000000011 5.537401072813854 1.596930000000000 + 441 1 31.970200000000011 7.383201430418472 0.000000000000000 + 442 2 33.568710000000012 8.306101609220781 -1.596930000000000 + 443 3 33.568710000000012 8.306101609220781 1.596930000000000 + 444 1 33.568710000000012 10.151901966825399 0.000000000000000 + 445 2 35.167220000000012 11.074802145627708 -1.596930000000000 + 446 3 35.167220000000012 11.074802145627708 1.596930000000000 + 447 1 35.167220000000012 12.920602503232326 0.000000000000000 + 448 2 
36.765730000000013 13.843502682034635 -1.596930000000000 + 449 3 36.765730000000013 13.843502682034635 1.596930000000000 + 450 1 36.765730000000013 15.689303039639253 0.000000000000000 + 451 2 38.364240000000013 16.612203218441562 -1.596930000000000 + 452 3 38.364240000000013 16.612203218441562 1.596930000000000 + 453 1 38.364240000000013 18.458003576046180 0.000000000000000 + 454 2 39.962750000000014 19.380903754848489 -1.596930000000000 + 455 3 39.962750000000014 19.380903754848489 1.596930000000000 + 456 1 39.962750000000014 21.226704112453107 0.000000000000000 + 457 2 41.561260000000014 22.149604291255416 -1.596930000000000 + 458 3 41.561260000000014 22.149604291255416 1.596930000000000 + 459 1 41.561260000000014 23.995404648860034 0.000000000000000 + 460 2 43.159770000000015 24.918304827662343 -1.596930000000000 + 461 3 43.159770000000015 24.918304827662343 1.596930000000000 + 462 1 43.159770000000015 26.764105185266961 0.000000000000000 + 463 2 44.758280000000015 27.687005364069270 -1.596930000000000 + 464 3 44.758280000000015 27.687005364069270 1.596930000000000 + 465 1 44.758280000000015 29.532805721673888 0.000000000000000 + 466 2 46.356790000000016 30.455705900476197 -1.596930000000000 + 467 3 46.356790000000016 30.455705900476197 1.596930000000000 + 468 1 46.356790000000016 32.301506258080815 0.000000000000000 + 469 2 47.955300000000017 33.224406436883124 -1.596930000000000 + 470 3 47.955300000000017 33.224406436883124 1.596930000000000 + 471 1 47.955300000000017 35.070206794487742 0.000000000000000 + 472 2 49.553810000000017 35.993106973290051 -1.596930000000000 + 473 3 49.553810000000017 35.993106973290051 1.596930000000000 + 474 1 49.553810000000017 37.838907330894669 0.000000000000000 + 475 2 51.152320000000018 38.761807509696978 -1.596930000000000 + 476 3 51.152320000000018 38.761807509696978 1.596930000000000 + 477 1 51.152320000000018 40.607607867301596 0.000000000000000 + 478 2 52.750830000000018 41.530508046103905 -1.596930000000000 + 479 3 
52.750830000000018 41.530508046103905 1.596930000000000 + 480 1 52.750830000000018 43.376308403708523 0.000000000000000 + 481 2 31.970200000000011 0.000000000000000 -1.596930000000000 + 482 3 31.970200000000011 0.000000000000000 1.596930000000000 + 483 1 31.970200000000011 1.845800357604618 0.000000000000000 + 484 2 33.568710000000012 2.768700536406927 -1.596930000000000 + 485 3 33.568710000000012 2.768700536406927 1.596930000000000 + 486 1 33.568710000000012 4.614500894011545 0.000000000000000 + 487 2 35.167220000000012 5.537401072813854 -1.596930000000000 + 488 3 35.167220000000012 5.537401072813854 1.596930000000000 + 489 1 35.167220000000012 7.383201430418472 0.000000000000000 + 490 2 36.765730000000013 8.306101609220781 -1.596930000000000 + 491 3 36.765730000000013 8.306101609220781 1.596930000000000 + 492 1 36.765730000000013 10.151901966825399 0.000000000000000 + 493 2 38.364240000000013 11.074802145627708 -1.596930000000000 + 494 3 38.364240000000013 11.074802145627708 1.596930000000000 + 495 1 38.364240000000013 12.920602503232326 0.000000000000000 + 496 2 39.962750000000014 13.843502682034635 -1.596930000000000 + 497 3 39.962750000000014 13.843502682034635 1.596930000000000 + 498 1 39.962750000000014 15.689303039639253 0.000000000000000 + 499 2 41.561260000000014 16.612203218441562 -1.596930000000000 + 500 3 41.561260000000014 16.612203218441562 1.596930000000000 + 501 1 41.561260000000014 18.458003576046180 0.000000000000000 + 502 2 43.159770000000015 19.380903754848489 -1.596930000000000 + 503 3 43.159770000000015 19.380903754848489 1.596930000000000 + 504 1 43.159770000000015 21.226704112453107 0.000000000000000 + 505 2 44.758280000000015 22.149604291255416 -1.596930000000000 + 506 3 44.758280000000015 22.149604291255416 1.596930000000000 + 507 1 44.758280000000015 23.995404648860034 0.000000000000000 + 508 2 46.356790000000016 24.918304827662343 -1.596930000000000 + 509 3 46.356790000000016 24.918304827662343 1.596930000000000 + 510 1 
46.356790000000016 26.764105185266961 0.000000000000000 + 511 2 47.955300000000017 27.687005364069270 -1.596930000000000 + 512 3 47.955300000000017 27.687005364069270 1.596930000000000 + 513 1 47.955300000000017 29.532805721673888 0.000000000000000 + 514 2 49.553810000000017 30.455705900476197 -1.596930000000000 + 515 3 49.553810000000017 30.455705900476197 1.596930000000000 + 516 1 49.553810000000017 32.301506258080815 0.000000000000000 + 517 2 51.152320000000018 33.224406436883124 -1.596930000000000 + 518 3 51.152320000000018 33.224406436883124 1.596930000000000 + 519 1 51.152320000000018 35.070206794487742 0.000000000000000 + 520 2 52.750830000000018 35.993106973290051 -1.596930000000000 + 521 3 52.750830000000018 35.993106973290051 1.596930000000000 + 522 1 52.750830000000018 37.838907330894669 0.000000000000000 + 523 2 54.349340000000019 38.761807509696978 -1.596930000000000 + 524 3 54.349340000000019 38.761807509696978 1.596930000000000 + 525 1 54.349340000000019 40.607607867301596 0.000000000000000 + 526 2 55.947850000000019 41.530508046103905 -1.596930000000000 + 527 3 55.947850000000019 41.530508046103905 1.596930000000000 + 528 1 55.947850000000019 43.376308403708523 0.000000000000000 + 529 2 35.167220000000012 0.000000000000000 -1.596930000000000 + 530 3 35.167220000000012 0.000000000000000 1.596930000000000 + 531 1 35.167220000000012 1.845800357604618 0.000000000000000 + 532 2 36.765730000000013 2.768700536406927 -1.596930000000000 + 533 3 36.765730000000013 2.768700536406927 1.596930000000000 + 534 1 36.765730000000013 4.614500894011545 0.000000000000000 + 535 2 38.364240000000013 5.537401072813854 -1.596930000000000 + 536 3 38.364240000000013 5.537401072813854 1.596930000000000 + 537 1 38.364240000000013 7.383201430418472 0.000000000000000 + 538 2 39.962750000000014 8.306101609220781 -1.596930000000000 + 539 3 39.962750000000014 8.306101609220781 1.596930000000000 + 540 1 39.962750000000014 10.151901966825399 0.000000000000000 + 541 2 
41.561260000000014 11.074802145627708 -1.596930000000000 + 542 3 41.561260000000014 11.074802145627708 1.596930000000000 + 543 1 41.561260000000014 12.920602503232326 0.000000000000000 + 544 2 43.159770000000015 13.843502682034635 -1.596930000000000 + 545 3 43.159770000000015 13.843502682034635 1.596930000000000 + 546 1 43.159770000000015 15.689303039639253 0.000000000000000 + 547 2 44.758280000000015 16.612203218441562 -1.596930000000000 + 548 3 44.758280000000015 16.612203218441562 1.596930000000000 + 549 1 44.758280000000015 18.458003576046180 0.000000000000000 + 550 2 46.356790000000016 19.380903754848489 -1.596930000000000 + 551 3 46.356790000000016 19.380903754848489 1.596930000000000 + 552 1 46.356790000000016 21.226704112453107 0.000000000000000 + 553 2 47.955300000000017 22.149604291255416 -1.596930000000000 + 554 3 47.955300000000017 22.149604291255416 1.596930000000000 + 555 1 47.955300000000017 23.995404648860034 0.000000000000000 + 556 2 49.553810000000017 24.918304827662343 -1.596930000000000 + 557 3 49.553810000000017 24.918304827662343 1.596930000000000 + 558 1 49.553810000000017 26.764105185266961 0.000000000000000 + 559 2 51.152320000000018 27.687005364069270 -1.596930000000000 + 560 3 51.152320000000018 27.687005364069270 1.596930000000000 + 561 1 51.152320000000018 29.532805721673888 0.000000000000000 + 562 2 52.750830000000018 30.455705900476197 -1.596930000000000 + 563 3 52.750830000000018 30.455705900476197 1.596930000000000 + 564 1 52.750830000000018 32.301506258080815 0.000000000000000 + 565 2 54.349340000000019 33.224406436883124 -1.596930000000000 + 566 3 54.349340000000019 33.224406436883124 1.596930000000000 + 567 1 54.349340000000019 35.070206794487742 0.000000000000000 + 568 2 55.947850000000019 35.993106973290051 -1.596930000000000 + 569 3 55.947850000000019 35.993106973290051 1.596930000000000 + 570 1 55.947850000000019 37.838907330894669 0.000000000000000 + 571 2 57.546360000000020 38.761807509696978 -1.596930000000000 + 572 3 
57.546360000000020 38.761807509696978 1.596930000000000 + 573 1 57.546360000000020 40.607607867301596 0.000000000000000 + 574 2 59.144870000000020 41.530508046103905 -1.596930000000000 + 575 3 59.144870000000020 41.530508046103905 1.596930000000000 + 576 1 59.144870000000020 43.376308403708523 0.000000000000000 + 577 2 38.364240000000013 0.000000000000000 -1.596930000000000 + 578 3 38.364240000000013 0.000000000000000 1.596930000000000 + 579 1 38.364240000000013 1.845800357604618 0.000000000000000 + 580 2 39.962750000000014 2.768700536406927 -1.596930000000000 + 581 3 39.962750000000014 2.768700536406927 1.596930000000000 + 582 1 39.962750000000014 4.614500894011545 0.000000000000000 + 583 2 41.561260000000014 5.537401072813854 -1.596930000000000 + 584 3 41.561260000000014 5.537401072813854 1.596930000000000 + 585 1 41.561260000000014 7.383201430418472 0.000000000000000 + 586 2 43.159770000000015 8.306101609220781 -1.596930000000000 + 587 3 43.159770000000015 8.306101609220781 1.596930000000000 + 588 1 43.159770000000015 10.151901966825399 0.000000000000000 + 589 2 44.758280000000015 11.074802145627708 -1.596930000000000 + 590 3 44.758280000000015 11.074802145627708 1.596930000000000 + 591 1 44.758280000000015 12.920602503232326 0.000000000000000 + 592 2 46.356790000000016 13.843502682034635 -1.596930000000000 + 593 3 46.356790000000016 13.843502682034635 1.596930000000000 + 594 1 46.356790000000016 15.689303039639253 0.000000000000000 + 595 2 47.955300000000017 16.612203218441562 -1.596930000000000 + 596 3 47.955300000000017 16.612203218441562 1.596930000000000 + 597 1 47.955300000000017 18.458003576046180 0.000000000000000 + 598 2 49.553810000000017 19.380903754848489 -1.596930000000000 + 599 3 49.553810000000017 19.380903754848489 1.596930000000000 + 600 1 49.553810000000017 21.226704112453107 0.000000000000000 + 601 2 51.152320000000018 22.149604291255416 -1.596930000000000 + 602 3 51.152320000000018 22.149604291255416 1.596930000000000 + 603 1 
51.152320000000018 23.995404648860034 0.000000000000000 + 604 2 52.750830000000018 24.918304827662343 -1.596930000000000 + 605 3 52.750830000000018 24.918304827662343 1.596930000000000 + 606 1 52.750830000000018 26.764105185266961 0.000000000000000 + 607 2 54.349340000000019 27.687005364069270 -1.596930000000000 + 608 3 54.349340000000019 27.687005364069270 1.596930000000000 + 609 1 54.349340000000019 29.532805721673888 0.000000000000000 + 610 2 55.947850000000019 30.455705900476197 -1.596930000000000 + 611 3 55.947850000000019 30.455705900476197 1.596930000000000 + 612 1 55.947850000000019 32.301506258080815 0.000000000000000 + 613 2 57.546360000000020 33.224406436883124 -1.596930000000000 + 614 3 57.546360000000020 33.224406436883124 1.596930000000000 + 615 1 57.546360000000020 35.070206794487742 0.000000000000000 + 616 2 59.144870000000020 35.993106973290051 -1.596930000000000 + 617 3 59.144870000000020 35.993106973290051 1.596930000000000 + 618 1 59.144870000000020 37.838907330894669 0.000000000000000 + 619 2 60.743380000000021 38.761807509696978 -1.596930000000000 + 620 3 60.743380000000021 38.761807509696978 1.596930000000000 + 621 1 60.743380000000021 40.607607867301596 0.000000000000000 + 622 2 62.341890000000022 41.530508046103905 -1.596930000000000 + 623 3 62.341890000000022 41.530508046103905 1.596930000000000 + 624 1 62.341890000000022 43.376308403708523 0.000000000000000 + 625 2 41.561260000000014 0.000000000000000 -1.596930000000000 + 626 3 41.561260000000014 0.000000000000000 1.596930000000000 + 627 1 41.561260000000014 1.845800357604618 0.000000000000000 + 628 2 43.159770000000015 2.768700536406927 -1.596930000000000 + 629 3 43.159770000000015 2.768700536406927 1.596930000000000 + 630 1 43.159770000000015 4.614500894011545 0.000000000000000 + 631 2 44.758280000000015 5.537401072813854 -1.596930000000000 + 632 3 44.758280000000015 5.537401072813854 1.596930000000000 + 633 1 44.758280000000015 7.383201430418472 0.000000000000000 + 634 2 
46.356790000000016 8.306101609220781 -1.596930000000000 + 635 3 46.356790000000016 8.306101609220781 1.596930000000000 + 636 1 46.356790000000016 10.151901966825399 0.000000000000000 + 637 2 47.955300000000017 11.074802145627708 -1.596930000000000 + 638 3 47.955300000000017 11.074802145627708 1.596930000000000 + 639 1 47.955300000000017 12.920602503232326 0.000000000000000 + 640 2 49.553810000000017 13.843502682034635 -1.596930000000000 + 641 3 49.553810000000017 13.843502682034635 1.596930000000000 + 642 1 49.553810000000017 15.689303039639253 0.000000000000000 + 643 2 51.152320000000018 16.612203218441562 -1.596930000000000 + 644 3 51.152320000000018 16.612203218441562 1.596930000000000 + 645 1 51.152320000000018 18.458003576046180 0.000000000000000 + 646 2 52.750830000000018 19.380903754848489 -1.596930000000000 + 647 3 52.750830000000018 19.380903754848489 1.596930000000000 + 648 1 52.750830000000018 21.226704112453107 0.000000000000000 + 649 2 54.349340000000019 22.149604291255416 -1.596930000000000 + 650 3 54.349340000000019 22.149604291255416 1.596930000000000 + 651 1 54.349340000000019 23.995404648860034 0.000000000000000 + 652 2 55.947850000000019 24.918304827662343 -1.596930000000000 + 653 3 55.947850000000019 24.918304827662343 1.596930000000000 + 654 1 55.947850000000019 26.764105185266961 0.000000000000000 + 655 2 57.546360000000020 27.687005364069270 -1.596930000000000 + 656 3 57.546360000000020 27.687005364069270 1.596930000000000 + 657 1 57.546360000000020 29.532805721673888 0.000000000000000 + 658 2 59.144870000000020 30.455705900476197 -1.596930000000000 + 659 3 59.144870000000020 30.455705900476197 1.596930000000000 + 660 1 59.144870000000020 32.301506258080815 0.000000000000000 + 661 2 60.743380000000021 33.224406436883124 -1.596930000000000 + 662 3 60.743380000000021 33.224406436883124 1.596930000000000 + 663 1 60.743380000000021 35.070206794487742 0.000000000000000 + 664 2 62.341890000000022 35.993106973290051 -1.596930000000000 + 665 3 
62.341890000000022 35.993106973290051 1.596930000000000 + 666 1 62.341890000000022 37.838907330894669 0.000000000000000 + 667 2 63.940400000000022 38.761807509696978 -1.596930000000000 + 668 3 63.940400000000022 38.761807509696978 1.596930000000000 + 669 1 63.940400000000022 40.607607867301596 0.000000000000000 + 670 2 65.538910000000023 41.530508046103905 -1.596930000000000 + 671 3 65.538910000000023 41.530508046103905 1.596930000000000 + 672 1 65.538910000000023 43.376308403708523 0.000000000000000 + 673 2 44.758280000000015 0.000000000000000 -1.596930000000000 + 674 3 44.758280000000015 0.000000000000000 1.596930000000000 + 675 1 44.758280000000015 1.845800357604618 0.000000000000000 + 676 2 46.356790000000016 2.768700536406927 -1.596930000000000 + 677 3 46.356790000000016 2.768700536406927 1.596930000000000 + 678 1 46.356790000000016 4.614500894011545 0.000000000000000 + 679 2 47.955300000000017 5.537401072813854 -1.596930000000000 + 680 3 47.955300000000017 5.537401072813854 1.596930000000000 + 681 1 47.955300000000017 7.383201430418472 0.000000000000000 + 682 2 49.553810000000017 8.306101609220781 -1.596930000000000 + 683 3 49.553810000000017 8.306101609220781 1.596930000000000 + 684 1 49.553810000000017 10.151901966825399 0.000000000000000 + 685 2 51.152320000000018 11.074802145627708 -1.596930000000000 + 686 3 51.152320000000018 11.074802145627708 1.596930000000000 + 687 1 51.152320000000018 12.920602503232326 0.000000000000000 + 688 2 52.750830000000018 13.843502682034635 -1.596930000000000 + 689 3 52.750830000000018 13.843502682034635 1.596930000000000 + 690 1 52.750830000000018 15.689303039639253 0.000000000000000 + 691 2 54.349340000000019 16.612203218441562 -1.596930000000000 + 692 3 54.349340000000019 16.612203218441562 1.596930000000000 + 693 1 54.349340000000019 18.458003576046180 0.000000000000000 + 694 2 55.947850000000019 19.380903754848489 -1.596930000000000 + 695 3 55.947850000000019 19.380903754848489 1.596930000000000 + 696 1 
55.947850000000019 21.226704112453107 0.000000000000000 + 697 2 57.546360000000020 22.149604291255416 -1.596930000000000 + 698 3 57.546360000000020 22.149604291255416 1.596930000000000 + 699 1 57.546360000000020 23.995404648860034 0.000000000000000 + 700 2 59.144870000000020 24.918304827662343 -1.596930000000000 + 701 3 59.144870000000020 24.918304827662343 1.596930000000000 + 702 1 59.144870000000020 26.764105185266961 0.000000000000000 + 703 2 60.743380000000021 27.687005364069270 -1.596930000000000 + 704 3 60.743380000000021 27.687005364069270 1.596930000000000 + 705 1 60.743380000000021 29.532805721673888 0.000000000000000 + 706 2 62.341890000000022 30.455705900476197 -1.596930000000000 + 707 3 62.341890000000022 30.455705900476197 1.596930000000000 + 708 1 62.341890000000022 32.301506258080815 0.000000000000000 + 709 2 63.940400000000022 33.224406436883124 -1.596930000000000 + 710 3 63.940400000000022 33.224406436883124 1.596930000000000 + 711 1 63.940400000000022 35.070206794487742 0.000000000000000 + 712 2 65.538910000000023 35.993106973290051 -1.596930000000000 + 713 3 65.538910000000023 35.993106973290051 1.596930000000000 + 714 1 65.538910000000023 37.838907330894669 0.000000000000000 + 715 2 67.137420000000023 38.761807509696978 -1.596930000000000 + 716 3 67.137420000000023 38.761807509696978 1.596930000000000 + 717 1 67.137420000000023 40.607607867301596 0.000000000000000 + 718 2 68.735930000000024 41.530508046103905 -1.596930000000000 + 719 3 68.735930000000024 41.530508046103905 1.596930000000000 + 720 1 68.735930000000024 43.376308403708523 0.000000000000000 + 721 2 47.955300000000017 0.000000000000000 -1.596930000000000 + 722 3 47.955300000000017 0.000000000000000 1.596930000000000 + 723 1 47.955300000000017 1.845800357604618 0.000000000000000 + 724 2 49.553810000000017 2.768700536406927 -1.596930000000000 + 725 3 49.553810000000017 2.768700536406927 1.596930000000000 + 726 1 49.553810000000017 4.614500894011545 0.000000000000000 + 727 2 
51.152320000000018 5.537401072813854 -1.596930000000000 + 728 3 51.152320000000018 5.537401072813854 1.596930000000000 + 729 1 51.152320000000018 7.383201430418472 0.000000000000000 + 730 2 52.750830000000018 8.306101609220781 -1.596930000000000 + 731 3 52.750830000000018 8.306101609220781 1.596930000000000 + 732 1 52.750830000000018 10.151901966825399 0.000000000000000 + 733 2 54.349340000000019 11.074802145627708 -1.596930000000000 + 734 3 54.349340000000019 11.074802145627708 1.596930000000000 + 735 1 54.349340000000019 12.920602503232326 0.000000000000000 + 736 2 55.947850000000019 13.843502682034635 -1.596930000000000 + 737 3 55.947850000000019 13.843502682034635 1.596930000000000 + 738 1 55.947850000000019 15.689303039639253 0.000000000000000 + 739 2 57.546360000000020 16.612203218441562 -1.596930000000000 + 740 3 57.546360000000020 16.612203218441562 1.596930000000000 + 741 1 57.546360000000020 18.458003576046180 0.000000000000000 + 742 2 59.144870000000020 19.380903754848489 -1.596930000000000 + 743 3 59.144870000000020 19.380903754848489 1.596930000000000 + 744 1 59.144870000000020 21.226704112453107 0.000000000000000 + 745 2 60.743380000000021 22.149604291255416 -1.596930000000000 + 746 3 60.743380000000021 22.149604291255416 1.596930000000000 + 747 1 60.743380000000021 23.995404648860034 0.000000000000000 + 748 2 62.341890000000022 24.918304827662343 -1.596930000000000 + 749 3 62.341890000000022 24.918304827662343 1.596930000000000 + 750 1 62.341890000000022 26.764105185266961 0.000000000000000 + 751 2 63.940400000000022 27.687005364069270 -1.596930000000000 + 752 3 63.940400000000022 27.687005364069270 1.596930000000000 + 753 1 63.940400000000022 29.532805721673888 0.000000000000000 + 754 2 65.538910000000023 30.455705900476197 -1.596930000000000 + 755 3 65.538910000000023 30.455705900476197 1.596930000000000 + 756 1 65.538910000000023 32.301506258080815 0.000000000000000 + 757 2 67.137420000000023 33.224406436883124 -1.596930000000000 + 758 3 
67.137420000000023 33.224406436883124 1.596930000000000 + 759 1 67.137420000000023 35.070206794487742 0.000000000000000 + 760 2 68.735930000000024 35.993106973290051 -1.596930000000000 + 761 3 68.735930000000024 35.993106973290051 1.596930000000000 + 762 1 68.735930000000024 37.838907330894669 0.000000000000000 + 763 2 70.334440000000024 38.761807509696978 -1.596930000000000 + 764 3 70.334440000000024 38.761807509696978 1.596930000000000 + 765 1 70.334440000000024 40.607607867301596 0.000000000000000 + 766 2 71.932950000000025 41.530508046103905 -1.596930000000000 + 767 3 71.932950000000025 41.530508046103905 1.596930000000000 + 768 1 71.932950000000025 43.376308403708523 0.000000000000000 diff --git a/examples/threebody/tmd.sw.mod b/examples/threebody/tmd.sw.mod new file mode 120000 index 0000000000..0affacdd40 --- /dev/null +++ b/examples/threebody/tmd.sw.mod @@ -0,0 +1 @@ +../../potentials/tmd.sw.mod \ No newline at end of file diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps index f52bd07fcf..d7820e4c34 100644 --- a/lib/gpu/Makefile.cuda_mps +++ b/lib/gpu/Makefile.cuda_mps @@ -1,5 +1,5 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for CUDA +# Generic Linux Makefile for CUDA with the Multi-Process Service (MPS) # - change CUDA_ARCH for your GPU # ------------------------------------------------------------------------- */ diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip index a736988596..d5391f7d6b 100644 --- a/lib/gpu/Makefile.hip +++ b/lib/gpu/Makefile.hip @@ -39,11 +39,9 @@ HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIP_COMPILER=$(shell $(HIP_PATH)/bin/hipconfig --compiler) ifeq (hcc,$(HIP_PLATFORM)) - HIP_OPTS += -ffast-math # possible values: gfx803,gfx900,gfx906 HIP_ARCH = gfx906 else ifeq (amd,$(HIP_PLATFORM)) - HIP_OPTS += -ffast-math # possible values: gfx803,gfx900,gfx906 HIP_ARCH = gfx906 else ifeq (nvcc,$(HIP_PLATFORM)) diff --git 
a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index 0b3084cbe9..bed6848980 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -1,5 +1,5 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for CUDA +# Generic Linux Makefile for CUDA # - Change CUDA_ARCH for your GPU # ------------------------------------------------------------------------- */ @@ -13,7 +13,7 @@ endif NVCC = nvcc -# obsolete hardware. not supported by current drivers anymore. +# obsolete hardware. not supported by current drivers and toolkits anymore. #CUDA_ARCH = -arch=sm_13 #CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE @@ -28,11 +28,11 @@ NVCC = nvcc #CUDA_ARCH = -arch=sm_37 # Maxwell hardware -CUDA_ARCH = -arch=sm_50 +#CUDA_ARCH = -arch=sm_50 #CUDA_ARCH = -arch=sm_52 # Pascal hardware -#CUDA_ARCH = -arch=sm_60 +CUDA_ARCH = -arch=sm_60 #CUDA_ARCH = -arch=sm_61 # Volta hardware @@ -70,7 +70,7 @@ LIB_DIR = ./ AR = ar BSH = /bin/sh -# GPU binning not recommended with modern GPUs +# GPU binning not recommended for most modern GPUs CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini include Nvidia.makefile diff --git a/lib/gpu/Makefile.linux_multi b/lib/gpu/Makefile.linux_multi index 05b869879e..f3d89fd9f0 100644 --- a/lib/gpu/Makefile.linux_multi +++ b/lib/gpu/Makefile.linux_multi @@ -1,6 +1,6 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for CUDA -# - Change CUDA_ARCH for your GPU +# Generic Linux Makefile for CUDA compiled for multiple compute capabilities +# - Add your GPU to CUDA_CODE # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps diff --git a/lib/gpu/Makefile.mpi b/lib/gpu/Makefile.mpi new file mode 120000 index 0000000000..8bad27d081 --- /dev/null +++ b/lib/gpu/Makefile.mpi @@ -0,0 +1 @@ +Makefile.linux \ No newline at end of file diff --git a/lib/gpu/Makefile.serial b/lib/gpu/Makefile.serial index
d24b03f8d6..6c94911f32 100644 --- a/lib/gpu/Makefile.serial +++ b/lib/gpu/Makefile.serial @@ -1,5 +1,5 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for CUDA +# Generic Linux Makefile for CUDA without MPI libraries # - Change CUDA_ARCH for your GPU # ------------------------------------------------------------------------- */ @@ -28,11 +28,11 @@ NVCC = nvcc #CUDA_ARCH = -arch=sm_37 # Maxwell hardware -CUDA_ARCH = -arch=sm_50 +#CUDA_ARCH = -arch=sm_50 #CUDA_ARCH = -arch=sm_52 # Pascal hardware -#CUDA_ARCH = -arch=sm_60 +CUDA_ARCH = -arch=sm_60 #CUDA_ARCH = -arch=sm_61 # Volta hardware @@ -41,6 +41,10 @@ CUDA_ARCH = -arch=sm_50 # Turing hardware #CUDA_ARCH = -arch=sm_75 +# Ampere hardware +#CUDA_ARCH = -arch=sm_80 +#CUDA_ARCH = -arch=sm_86 + # this setting should match LAMMPS Makefile # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL diff --git a/lib/gpu/Makefile.turing b/lib/gpu/Makefile.turing deleted file mode 100644 index 390de9c558..0000000000 --- a/lib/gpu/Makefile.turing +++ /dev/null @@ -1,23 +0,0 @@ -NVCC = $(CUDA_HOME)/bin/nvcc -EXTRAMAKE = Makefile.lammps.standard - -CUDA_ARCH = -arch=sm_75 -CUDA_PRECISION = -D_SINGLE_DOUBLE -CUDA_INCLUDE = -I$(CUDA_HOME)/include -CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 -lcudart -CUDA_OPTS = -DUNIX -O3 --use_fast_math --ftz=true - -CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include -CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON -DLAMMPS_SMALLBIG - -BIN_DIR = . -OBJ_DIR = obj -LIB_DIR = . 
-AR = ar -BSH = /bin/sh - -# GPU binning not recommended with most modern GPUs -CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini - -include Nvidia.makefile - diff --git a/lib/kim/Install.py b/lib/kim/Install.py index ae4b356ba9..da0a4296d6 100644 --- a/lib/kim/Install.py +++ b/lib/kim/Install.py @@ -17,6 +17,8 @@ parser = ArgumentParser(prog='Install.py', # settings +CMAKE = os.environ.get('CMAKE') or 'cmake' + thisdir = fullpath('.') version = "2.2.1" @@ -141,7 +143,7 @@ if buildflag: # configure kim-api print("Configuring kim-api ...") - cmd = 'cd "%s/kim-api-%s" && mkdir build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX="%s" -DCMAKE_BUILD_TYPE=Release' % (thisdir,version,kimdir) + cmd = 'cd "%s/kim-api-%s" && mkdir build && cd build && %s .. -DCMAKE_INSTALL_PREFIX="%s" -DCMAKE_BUILD_TYPE=Release' % (thisdir,version,CMAKE,kimdir) txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) if verboseflag: print(txt.decode("UTF-8")) diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 7bb6de4cd9..2e779791dd 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,165 @@ # Change Log +## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00) + +### Features: + +- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098) +- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096) +- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065) +- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922) +- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920) +- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) 
[\#4034](https://github.com/kokkos/kokkos/pull/4034) +- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038) +- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247) +- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980) +- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232) +- Significant SYCL enhancements (see below) + +### Deprecations: + +- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249) +- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201) +- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094) +- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011) +- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978) +- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973) + + +### Backends and Archs Enhancements: + +- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296) +- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226) +- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944) +- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212) +- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207) +- 
StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991) +- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929) +- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312) + +#### SYCL: +- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268) +- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183) +- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211) +- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168) +- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161) +- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147) +- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940) +- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103) +- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089) +- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088) +- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086) +- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992) +- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889) + +#### CUDA: +- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271) +- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229) +- Improve error
message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227) +- Update support for cuda reductions to work with types < 4bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156) +- Fix incompatible team size deduction in rare cases parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142) +- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129) +- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114) +- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233) +- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999) + +#### HIP: +- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953) +- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820) +- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170) + +#### Serial: +- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053) + +#### OpenMPTarget: +- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040) +- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032) +- OpenMPTarget: Fix bug in the case of a reducer.
[\#4044](https://github.com/kokkos/kokkos/pull/4044) +- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041) + +### Implemented enhancements BuildSystem + +#### Important BuildSystem Updates: +- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941) +- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855) + +#### Other Improvements: +- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298) +- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993) +- Adds support for -time= and -time in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015) +- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968) +- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792) +- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133) +- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196) +- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235) +- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215) +- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199) +- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176) +- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120) +- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987) +- Add support for AMD Zen3 CPU architecture 
[\#3972](https://github.com/kokkos/kokkos/pull/3972) +- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945) +- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522) +- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946) +- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150) + +### Other Changes: + +#### Tool Enhancements: + +- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977) +- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961) +- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955) + + +#### Miscellaneous: + +- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284) +- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985) +- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198) +- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055) +- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002) + +### Incompatibilities: + +- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138) + +### Bug Fixes: +- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130) +- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144) +- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418) +- Adding missing memory fence to serial exec space fence. 
[\#4292](https://github.com/kokkos/kokkos/pull/4292) +- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291) +- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281) +- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278) +- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264) +- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260) +- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252) +- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242) +- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221) +- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218) +- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216) +- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179) +- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173) +- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172) +- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159) +- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157) +- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107) +- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102) +- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087) +- Fixup global buffer overflow in hand rolled string manipulation 
[\#4070](https://github.com/kokkos/kokkos/pull/4070) +- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069) +- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067) +- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061) +- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013) +- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010) +- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003) +- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967) +- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962) +- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924) +- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928) + ## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 9452027d8e..1b6753f983 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -111,8 +111,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 01) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS) # which needs another workaround. 
SET(KOKKOS_COMPILE_OPTIONS_TMP) FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\") + STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) + IF(OPTION_HAS_WHITESPACE EQUAL -1) + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") + ELSE() + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") + ENDIF() ENDFOREACH() STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 2a984eefb6..7ffea5a62c 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,20 +11,21 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 01 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) -# Options: Cuda,HIP,OpenMP,Pthread,Serial +# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthread" -# Options: +# Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: Vega900,Vega906,Vega908 +# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 +# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -32,8 +33,8 @@ KOKKOS_DEBUG ?= "no" KOKKOS_USE_TPLS ?= "" # Options: c++14,c++1y,c++17,c++1z,c++2a KOKKOS_CXX_STANDARD ?= "c++14" -# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align -KOKKOS_OPTIONS ?= "" +# Options: 
aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings,enable_desul_atomics +KOKKOS_OPTIONS ?= "enable_desul_atomics" KOKKOS_CMAKE ?= "no" KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" @@ -80,7 +81,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) -KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) +KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning) KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align) KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check) @@ -92,6 +93,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) +KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) +KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code) +KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) @@ -112,6 +116,7 @@ endif # Check for other Execution Spaces. 
KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda) KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP) +KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL) KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget) KOKKOS_DEVICELIST = @@ -133,11 +138,18 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_DEVICELIST += HIP endif +KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX20) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX2A)) +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + KOKKOS_DEVICELIST += SYCL + ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1) + $(error SYCL backend requires C++17 or newer) + endif + +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_DEVICELIST += OPENMPTARGET - KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \ - + $(KOKKOS_INTERNAL_ENABLE_CXX20) \ - + $(KOKKOS_INTERNAL_ENABLE_CXX2A)) ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1) $(error OpenMPTarget backend requires C++17 or newer) endif @@ -168,6 +180,8 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) KOKKOS_INTERNAL_COMPILER_GCC := 
$(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) @@ -247,7 +261,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + else KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp + endif else ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp @@ -259,7 +277,11 @@ else # OpenMP is turned on by default in Cray compiler environment. KOKKOS_INTERNAL_OPENMP_FLAG := else - KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp + else + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + endif endif endif endif @@ -317,6 +339,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX) KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) + # NVIDIA based. 
NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30) @@ -384,20 +413,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3) KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) -KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0) + KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) +KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A) # Any AVX? KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support. 
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) @@ -406,7 +440,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Incompatible flags? 
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc ) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -442,6 +476,10 @@ KOKKOS_LINK_FLAGS = KOKKOS_SRC = KOKKOS_HEADERS = +#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_LIBS += -latomic +#endif + # Generating the KokkosCore_config.h file. 
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp @@ -478,6 +516,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP') endif +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL') +endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) @@ -533,6 +575,12 @@ endif #only add the c++ standard flags if this is not CMake tmp := $(call kokkos_append_header,"/* General Settings */") +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1) ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG) @@ -635,8 +683,10 @@ endif tmp := $(call kokkos_append_header,"/* Optimization Settings */") -ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1) +ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1) + # deprecated tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION") endif tmp := $(call kokkos_append_header,"/* Cuda Settings */") @@ -1166,6 +1216,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a + endif KOKKOS_SRC += $(wildcard 
$(KOKKOS_PATH)/core/src/HIP/*.cpp) @@ -1184,6 +1239,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) endif endif +# Figure out the architecture flag for SYCL. +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1) + tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp" + endif + + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp) + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp) + + KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG) + KOKKOS_LDFLAGS+=-fsycl + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG) +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS") +endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) @@ -1196,56 +1297,62 @@ endif ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) -# Functions for generating config header file -kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) -kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) -kokkos_append_config_header = $(shell echo $1 >> $2)) -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") -tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") -tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") -tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", 
"KokkosCore_Config_DeclareBackend.hpp") -tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") - ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) - else - endif -endif -ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include 
","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif + # Functions for generating config header file + kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) + kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) + kokkos_append_config_header = $(shell echo $1 >> $2)) + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") + tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + else + endif + endif + ifeq 
($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) + tmp := $(call kokkos_append_config_header,"$H""include 
","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif endif + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) @@ -1257,6 +1364,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp) ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) + ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) + KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp + endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) ifneq ($(CUDA_PATH),) KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index cf9fc24242..93854d0cf1 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp +Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) +Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp +Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp +Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index d55ef2caac..673f462712 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides abstractions for both parallel execution of code and data management. Kokkos is designed to target complex node architectures with N-level memory hierarchies and multiple types of execution resources. It currently can use -CUDA, HPX, OpenMP and Pthreads as backend programming models with several other +CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem, @@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools). # Learning about Kokkos -A programming guide can be found on the Wiki, the API reference is under development. +The best way to start learning about Kokkos is going through the Kokkos Lectures. +They are online available at https://kokkos.link/the-lectures and contain a mix +of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem +capabilities. + +A programming guide and API reference can be found on the Wiki +(https://github.com/kokkos/kokkos/wiki). For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. For non-public questions send an email to crtrott(at)sandia.gov -A separate repository with extensive tutorial material can be found under -https://github.com/kokkos/kokkos-tutorials. 
- -Furthermore, the 'example/tutorial' directory provides step by step tutorial -examples which explain many of the features of Kokkos. They work with -simple Makefiles. To build with g++ and OpenMP simply type 'make' -in the 'example/tutorial' directory. This will build all examples in the -subfolders. To change the build options refer to the Programming Guide -in the compilation section. - -To learn more about Kokkos consider watching one of our presentations: -* GTC 2015: - - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html - - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf - - # Contributing to Kokkos We are open and try to encourage contributions from external developers. @@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution. # Requirements -### Primary tested compilers on X86 are: -* GCC 5.3.0 -* GCC 5.4.0 -* GCC 5.5.0 -* GCC 6.1.0 -* GCC 7.2.0 -* GCC 7.3.0 -* GCC 8.1.0 -* Intel 17.0.1 -* Intel 17.4.196 -* Intel 18.2.128 -* Clang 4.0.0 -* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0) -* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1) -* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2) -* PGI 18.7 -* NVCC 9.1 for CUDA (with gcc 6.1.0) -* NVCC 9.2 for CUDA (with gcc 7.2.0) -* NVCC 10.0 for CUDA (with gcc 7.4.0) -* NVCC 10.1 for CUDA (with gcc 7.4.0) -* NVCC 11.0 for CUDA (with gcc 8.4.0) +### Minimum Compiler Versions -### Primary tested compilers on Power 8 are: -* GCC 6.4.0 (OpenMP,Serial) -* GCC 7.2.0 (OpenMP,Serial) -* IBM XL 16.1.0 (OpenMP, Serial) -* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0) +Generally Kokkos should work with all compiler versions newer than the minimum. +However as in all sufficiently complex enough code, we have to work around compiler +bugs with almost all compilers. So compiler versions we don't test may have issues +we are unaware off. 
-### Primary tested compilers on Intel KNL are: -* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0) -* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0) +* GCC: 5.3.0 +* Clang: 4.0.0 +* Intel: 17.0.1 +* NVCC: 9.2.88 +* NVC++: 21.5 +* ROCM: 4.3 +* MSVC: 19.29 +* IBM XL: 16.1.1 +* Fujitsu: 4.5.0 +* ARM/Clang 20.1 -### Primary tested compilers on ARM (Cavium ThunderX2) -* GCC 7.2.0 -* ARM/Clang 18.4.0 +### Primary Tested Compilers -### Other compilers working: -* X86: - * Cygwin 2.1.0 64bit with gcc 4.9.3 - * GCC 8.1.0 (not warning free) - -### Known non-working combinations: -* Power8: - * Pthreads backend -* ARM - * Pthreads backend +* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0 +* NVCC: 9.2.88, 10.1, 11.0 +* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0 +* Intel 17.4, 18.1, 19.5 +* MSVC: 19.29 +* ARM/Clang: 20.1 +* IBM XL: 16.1.1 +* ROCM: 4.3.0 ### Build system: -* CMake >= 3.10: required -* CMake >= 3.13: recommended + +* CMake >= 3.16: required * CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues). +* CMake >= 3.21.1 for NVC++ Primary tested compiler are passing in release mode with warnings as errors. They also are tested with a comprehensive set of @@ -153,7 +126,6 @@ cmake $srcdir \ -DCMAKE_INSTALL_PREFIX=$path_to_install \ -DKokkos_ENABLE_OPENMP=On \ -DKokkos_ARCH_HSW=On \ - -DKokkos_ENABLE_HWLOC=On \ -DKokkos_HWLOC_DIR=$path_to_hwloc ```` then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages. @@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest). ## Raw Makefile -A bash script is provided to generate raw makefiles. 
-To install Kokkos as a library create a build directory and run the following -````bash -> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install -```` -Once the Makefile is generated, run: -````bash -> make kokkoslib -> make install -```` -To additionally run the unit tests: -````bash -> make build-test -> make test -```` -Run `generate_makefile.bash --help` for more detailed options such as -changing the device type for which to build. + +Raw Makefiles are only supported via inline builds. See below. ## Inline Builds vs. Installed Package For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package. @@ -268,6 +225,35 @@ more than a single GPU is used by a single process. If you publish work which mentions Kokkos, please cite the following paper: +````BibTex +@ARTICLE{9485033, + author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. 
and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Kokkos 3: Programming Model Extensions for the Exascale Era}, + year={2022}, + volume={33}, + number={4}, + pages={805-817}, + doi={10.1109/TPDS.2021.3097283}} +```` + +If you use more than one Kokkos EcoSystem package, please also cite: + +````BibTex +@ARTICLE{9502936, + author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff}, + journal={Computing in Science Engineering}, + title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing}, + year={2021}, + volume={23}, + number={5}, + pages={10-18}, + doi={10.1109/MCSE.2021.3098509}} +```` + + +And if you feel generous: feel free to cite the original Kokkos paper which describes most of the basic Kokkos concepts: + ````BibTeX @article{CarterEdwards20143202, title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ", diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 4df76a1dbb..eb54db8a55 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms) IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() -IF(NOT (KOKKOS_ENABLE_OPENMPTARGET - AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR - KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))) +IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) ENDIF() diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp 
b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 55ce19971f..46b8ab87fa 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -447,6 +447,25 @@ struct rand { } }; +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +template +struct rand { + using half = Kokkos::Experimental::half_t; + KOKKOS_INLINE_FUNCTION + static half max() { return half(1.0); } + KOKKOS_INLINE_FUNCTION + static half draw(Generator& gen) { return half(gen.frand()); } + KOKKOS_INLINE_FUNCTION + static half draw(Generator& gen, const half& range) { + return half(gen.frand(float(range))); + } + KOKKOS_INLINE_FUNCTION + static half draw(Generator& gen, const half& start, const half& end) { + return half(gen.frand(float(start), float(end))); + } +}; +#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template struct rand { KOKKOS_INLINE_FUNCTION @@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState template struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST @@ -615,7 +634,7 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_CUDA template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { #ifdef __CUDA_ARCH__ @@ -625,7 +644,7 @@ struct Random_UniqueIndex { blockDim.x * blockDim.y * blockDim.z + i_offset) % locks_.extent(0); - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i += blockDim.x * blockDim.y * blockDim.z; if (i >= static_cast(locks_.extent(0))) { i = i_offset; @@ -643,7 +662,7 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_HIP template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; 
KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { #ifdef __HIP_DEVICE_COMPILE__ @@ -653,7 +672,7 @@ struct Random_UniqueIndex { blockDim.x * blockDim.y * blockDim.z + i_offset) % locks_.extent(0); - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i += blockDim.x * blockDim.y * blockDim.z; if (i >= static_cast(locks_.extent(0))) { i = i_offset; @@ -671,15 +690,15 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_SYCL template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { -#ifdef KOKKOS_ARCH_INTEL_GEN +#ifdef KOKKOS_ARCH_INTEL_GPU int i = Kokkos::Impl::clock_tic() % locks_.extent(0); #else int i = 0; #endif - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i = (i + 1) % static_cast(locks_.extent(0)); } return i; @@ -690,14 +709,14 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_OPENMPTARGET template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks) { const int team_size = omp_get_num_threads(); int i = omp_get_team_num() * team_size + omp_get_thread_num(); const int lock_size = locks.extent_int(0); - while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) { i = (i + 1) % lock_size; } return i; @@ -856,18 +875,22 @@ template class Random_XorShift64_Pool { private: using execution_space = typename DeviceType::execution_space; - using locks_type = View; - using state_data_type = View; + using locks_type = View; + using state_data_type = View; locks_type locks_; state_data_type state_; int num_states_; + int padding_; public: using generator_type = Random_XorShift64; using device_type = 
DeviceType; KOKKOS_INLINE_FUNCTION - Random_XorShift64_Pool() { num_states_ = 0; } + Random_XorShift64_Pool() { + num_states_ = 0; + padding_ = 0; + } Random_XorShift64_Pool(uint64_t seed) { num_states_ = 0; @@ -883,16 +906,22 @@ class Random_XorShift64_Pool { locks_ = src.locks_; state_ = src.state_; num_states_ = src.num_states_; + padding_ = src.padding_; return *this; } void init(uint64_t seed, int num_states) { if (seed == 0) seed = uint64_t(1318319); - + // I only want to pad on CPU like archs (less than 1000 threads). 64 is a + // magic number, or random number I just wanted something not too large and + // not too small. 64 sounded fine. + padding_ = num_states < 1000 ? 64 : 1; num_states_ = num_states; - locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_); - state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_); + locks_ = + locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_); + state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_, + padding_); typename state_data_type::HostMirror h_state = create_mirror_view(state_); typename locks_type::HostMirror h_lock = create_mirror_view(locks_); @@ -902,15 +931,15 @@ class Random_XorShift64_Pool { gen(seed, 0); for (int i = 0; i < 17; i++) gen.rand(); for (int i = 0; i < num_states_; i++) { - int n1 = gen.rand(); - int n2 = gen.rand(); - int n3 = gen.rand(); - int n4 = gen.rand(); - h_state(i) = (((static_cast(n1)) & 0xffff) << 00) | - (((static_cast(n2)) & 0xffff) << 16) | - (((static_cast(n3)) & 0xffff) << 32) | - (((static_cast(n4)) & 0xffff) << 48); - h_lock(i) = 0; + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i, 0) = (((static_cast(n1)) & 0xffff) << 00) | + (((static_cast(n2)) & 0xffff) << 16) | + (((static_cast(n3)) & 0xffff) << 32) | + (((static_cast(n4)) & 0xffff) << 48); + h_lock(i, 0) = 0; } deep_copy(state_, h_state); deep_copy(locks_, h_lock); @@ -920,19 +949,19 @@ class 
Random_XorShift64_Pool { Random_XorShift64 get_state() const { const int i = Impl::Random_UniqueIndex::get_state_idx(locks_); - return Random_XorShift64(state_(i), i); + return Random_XorShift64(state_(i, 0), i); } // NOTE: state_idx MUST be unique and less than num_states KOKKOS_INLINE_FUNCTION Random_XorShift64 get_state(const int state_idx) const { - return Random_XorShift64(state_(state_idx), state_idx); + return Random_XorShift64(state_(state_idx, 0), state_idx); } KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift64& state) const { - state_(state.state_idx_) = state.state_; - locks_(state.state_idx_) = 0; + state_(state.state_idx_, 0) = state.state_; + locks_(state.state_idx_, 0) = 0; } }; @@ -1092,14 +1121,15 @@ template class Random_XorShift1024_Pool { private: using execution_space = typename DeviceType::execution_space; - using locks_type = View; - using int_view_type = View; + using locks_type = View; + using int_view_type = View; using state_data_type = View; locks_type locks_; state_data_type state_; int_view_type p_; int num_states_; + int padding_; friend class Random_XorShift1024; public: @@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool { state_ = src.state_; p_ = src.p_; num_states_ = src.num_states_; + padding_ = src.padding_; return *this; } inline void init(uint64_t seed, int num_states) { if (seed == 0) seed = uint64_t(1318319); + // I only want to pad on CPU like archs (less than 1000 threads). 64 is a + // magic number, or random number I just wanted something not too large and + // not too small. 64 sounded fine. + padding_ = num_states < 1000 ? 
64 : 1; num_states_ = num_states; - locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_); + locks_ = + locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_); state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_); - p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_); + p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_); typename state_data_type::HostMirror h_state = create_mirror_view(state_); typename locks_type::HostMirror h_lock = create_mirror_view(locks_); @@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool { (((static_cast(n3)) & 0xffff) << 32) | (((static_cast(n4)) & 0xffff) << 48); } - h_p(i) = 0; - h_lock(i) = 0; + h_p(i, 0) = 0; + h_lock(i, 0) = 0; } deep_copy(state_, h_state); deep_copy(locks_, h_lock); @@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool { Random_XorShift1024 get_state() const { const int i = Impl::Random_UniqueIndex::get_state_idx(locks_); - return Random_XorShift1024(state_, p_(i), i); + return Random_XorShift1024(state_, p_(i, 0), i); }; // NOTE: state_idx MUST be unique and less than num_states KOKKOS_INLINE_FUNCTION Random_XorShift1024 get_state(const int state_idx) const { - return Random_XorShift1024(state_, p_(state_idx), state_idx); + return Random_XorShift1024(state_, p_(state_idx, 0), state_idx); } KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift1024& state) const { for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i]; - p_(state.state_idx_) = state.p_; - locks_(state.state_idx_) = 0; + p_(state.state_idx_, 0) = state.p_; + locks_(state.state_idx_, 0) = 0; } }; diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index d17c02776f..9c2e8b978b 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -206,8 +206,10 @@ class BinSort { //---------------------------------------- // Constructor: takes the 
keys, the binning_operator and optionally whether to // sort within bins (default false) - BinSort(const_key_view_type keys_, int range_begin_, int range_end_, - BinSortOp bin_op_, bool sort_within_bins_ = false) + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + int range_begin_, int range_end_, BinSortOp bin_op_, + bool sort_within_bins_ = false) : keys(keys_), keys_rnd(keys_), bin_op(bin_op_), @@ -222,50 +224,63 @@ class BinSort { "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); bin_count_const = bin_count_atomic; bin_offsets = - offset_type(view_alloc(WithoutInitializing, + offset_type(view_alloc(exec, WithoutInitializing, "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), bin_op.max_bins()); sort_order = - offset_type(view_alloc(WithoutInitializing, + offset_type(view_alloc(exec, WithoutInitializing, "Kokkos::SortImpl::BinSortFunctor::sort_order"), range_end - range_begin); } + BinSort(const_key_view_type keys_, int range_begin_, int range_end_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_, + sort_within_bins_) {} + + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} + BinSort(const_key_view_type keys_, BinSortOp bin_op_, bool sort_within_bins_ = false) - : BinSort(keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} + : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {} //---------------------------------------- // Create the permutation vector, the bin_offset array and the bin_count // array. 
Can be called again if keys changed - void create_permute_vector() { + template + void create_permute_vector(const ExecutionSpace& exec = execution_space{}) { const size_t len = range_end - range_begin; Kokkos::parallel_for( "Kokkos::Sort::BinCount", - Kokkos::RangePolicy(0, len), *this); + Kokkos::RangePolicy(exec, 0, len), + *this); Kokkos::parallel_scan("Kokkos::Sort::BinOffset", - Kokkos::RangePolicy( - 0, bin_op.max_bins()), + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), *this); - Kokkos::deep_copy(bin_count_atomic, 0); + Kokkos::deep_copy(exec, bin_count_atomic, 0); Kokkos::parallel_for( "Kokkos::Sort::BinBinning", - Kokkos::RangePolicy(0, len), *this); + Kokkos::RangePolicy(exec, 0, len), + *this); if (sort_within_bins) Kokkos::parallel_for( "Kokkos::Sort::BinSort", - Kokkos::RangePolicy( - 0, bin_op.max_bins()), + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), *this); } // Sort a subset of a view with respect to the first dimension using the // permutation array - template - void sort(ValuesViewType const& values, int values_range_begin, - int values_range_end) const { + template + void sort(const ExecutionSpace& exec, ValuesViewType const& values, + int values_range_begin, int values_range_end) const { using scratch_view_type = Kokkos::View 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, values.rank_dynamic > 1 ? 
values.extent(1) @@ -308,7 +323,7 @@ class BinSort { values_range_begin - range_begin); parallel_for("Kokkos::Sort::CopyPermute", - Kokkos::RangePolicy(0, len), functor); + Kokkos::RangePolicy(exec, 0, len), functor); } { @@ -316,10 +331,23 @@ class BinSort { values, range_begin, sorted_values); parallel_for("Kokkos::Sort::Copy", - Kokkos::RangePolicy(0, len), functor); + Kokkos::RangePolicy(exec, 0, len), functor); } + } - execution_space().fence(); + // Sort a subset of a view with respect to the first dimension using the + // permutation array + template + void sort(ValuesViewType const& values, int values_range_begin, + int values_range_end) const { + execution_space exec; + sort(exec, values, values_range_begin, values_range_end); + exec.fence("Kokkos::Sort: fence after sorting"); + } + + template + void sort(ExecutionSpace const& exec, ValuesViewType const& values) const { + this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin); } template @@ -485,17 +513,19 @@ struct BinOp3D { namespace Impl { -template -bool try_std_sort(ViewType view) { +template +bool try_std_sort(ViewType view, const ExecutionSpace& exec) { bool possible = true; size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(), view.stride_3(), view.stride_4(), view.stride_5(), view.stride_6(), view.stride_7()}; possible = possible && - std::is_same::value; + SpaceAccessibility::accessible; possible = possible && (ViewType::Rank == 1); possible = possible && (stride[0] == 1); if (possible) { + exec.fence("Kokkos::sort: Fence before sorting on the host"); std::sort(view.data(), view.data() + view.extent(0)); } return possible; @@ -518,10 +548,12 @@ struct min_max_functor { } // namespace Impl -template -void sort(ViewType const& view, bool const always_use_kokkos_sort = false) { +template +std::enable_if_t::value> sort( + const ExecutionSpace& exec, ViewType const& view, + bool const always_use_kokkos_sort = false) { if (!always_use_kokkos_sort) { - if 
(Impl::try_std_sort(view)) return; + if (Impl::try_std_sort(view, exec)) return; } using CompType = BinOp1D; @@ -529,34 +561,50 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) { Kokkos::MinMax reducer(result); parallel_reduce("Kokkos::Sort::FindExtent", Kokkos::RangePolicy( - 0, view.extent(0)), + exec, 0, view.extent(0)), Impl::min_max_functor(view), reducer); if (result.min_val == result.max_val) return; BinSort bin_sort( view, CompType(view.extent(0) / 2, result.min_val, result.max_val), true); - bin_sort.create_permute_vector(); - bin_sort.sort(view); + bin_sort.create_permute_vector(exec); + bin_sort.sort(exec, view); } template -void sort(ViewType view, size_t const begin, size_t const end) { +void sort(ViewType const& view, bool const always_use_kokkos_sort = false) { + typename ViewType::execution_space exec; + sort(exec, view, always_use_kokkos_sort); + exec.fence("Kokkos::Sort: fence after sorting"); +} + +template +std::enable_if_t::value> sort( + const ExecutionSpace& exec, ViewType view, size_t const begin, + size_t const end) { using range_policy = Kokkos::RangePolicy; using CompType = BinOp1D; Kokkos::MinMaxScalar result; Kokkos::MinMax reducer(result); - parallel_reduce("Kokkos::Sort::FindExtent", range_policy(begin, end), + parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end), Impl::min_max_functor(view), reducer); if (result.min_val == result.max_val) return; BinSort bin_sort( - view, begin, end, + exec, view, begin, end, CompType((end - begin) / 2, result.min_val, result.max_val), true); - bin_sort.create_permute_vector(); - bin_sort.sort(view, begin, end); + bin_sort.create_permute_vector(exec); + bin_sort.sort(exec, view, begin, end); +} + +template +void sort(ViewType view, size_t const begin, size_t const end) { + typename ViewType::execution_space exec; + sort(exec, view, begin, end); + exec.fence("Kokkos::Sort: fence after sorting"); } } // namespace Kokkos diff --git 
a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index c37e779c99..3dffce7df4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include #include @@ -198,11 +198,50 @@ struct test_random_functor { static_cast(1.0 * HIST_DIM3D * tmp2 / theMax); const uint64_t ind3_3d = static_cast(1.0 * HIST_DIM3D * tmp3 / theMax); - +// Workaround Intel 17 compiler bug which sometimes add random +// instruction alignment which makes the lock instruction +// illegal. Seems to be mostly just for unsigned int atomics. +// Looking at the assembly the compiler +// appears to insert cache line alignment for the instruction. +// Isn't restricted to specific archs. Seen it on SNB and SKX, but for +// different code. Another occurrence was with Desul atomics in +// a different unit test. This one here happens without desul atomics. +// Inserting an assembly nop instruction changes the alignment and +// works round this. 
+// +// 17.0.4 for 64bit Random works with 1/1/1/2/1 +// 17.0.4 for 1024bit Random works with 1/1/1/1/1 +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind1_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind2_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind3_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + if (std::is_same>::value) + asm volatile("nop\n"); + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif } rand_pool.free_state(rand_gen); } @@ -338,9 +377,11 @@ struct test_random_scalar { using functor_type = test_histogram1d_functor; parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result); - - double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); - double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D; + double mean_eps_expect = 0.0001; + double variance_eps_expect = 0.07; + double covariance_eps_expect = 0.06; + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D; double variance_expect = 1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D; @@ -349,11 +390,26 @@ struct test_random_scalar { variance_expect / (result.variance / HIST_DIM1D) - 1.0; double covariance_eps = (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; - pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0; - pass_hist1d_var = - ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 
1 : 0; - pass_hist1d_covar = - ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0; + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + if (std::is_same::value) { + mean_eps_expect = 0.0003; + variance_eps_expect = 1.0; + covariance_eps_expect = 5.0e4; + } +#endif + + pass_hist1d_mean = + ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1 + : 0; + pass_hist1d_var = ((-variance_eps_expect < variance_eps) && + (variance_eps_expect > variance_eps)) + ? 1 + : 0; + pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) && + (covariance_eps_expect > covariance_eps)) + ? 1 + : 0; cout << "Density 1D: " << mean_eps << " " << variance_eps << " " << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || " @@ -371,8 +427,9 @@ struct test_random_scalar { test_histogram3d_functor; parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result); - double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); - double mean_expect = 1.0 * num_draws / HIST_DIM1D; + double variance_factor = 1.2; + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws / HIST_DIM1D; double variance_expect = 1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D; @@ -381,15 +438,23 @@ struct test_random_scalar { variance_expect / (result.variance / HIST_DIM1D) - 1.0; double covariance_eps = (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + if (std::is_same::value) { + variance_factor = 7; + } +#endif + pass_hist3d_mean = ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0; - pass_hist3d_var = ((-1.2 * tolerance < variance_eps) && - (1.2 * tolerance > variance_eps)) + pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) && + (variance_factor * tolerance > variance_eps)) ? 
1 : 0; - pass_hist3d_covar = - ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1 - : 0; + pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) && + (variance_factor * tolerance > covariance_eps)) + ? 1 + : 0; cout << "Density 3D: " << mean_eps << " " << variance_eps << " " << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance @@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) { deep_copy(density_1d, 0); deep_copy(density_3d, 0); + cout << "Test Scalar=half" << endl; + test_random_scalar test_half( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_half.pass_mean, 1); + ASSERT_EQ(test_half.pass_var, 1); + ASSERT_EQ(test_half.pass_covar, 1); + ASSERT_EQ(test_half.pass_hist1d_mean, 1); + ASSERT_EQ(test_half.pass_hist1d_var, 1); + ASSERT_EQ(test_half.pass_hist1d_covar, 1); + ASSERT_EQ(test_half.pass_hist3d_mean, 1); + ASSERT_EQ(test_half.pass_hist3d_var, 1); + ASSERT_EQ(test_half.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + cout << "Test Scalar=float" << endl; test_random_scalar test_float(density_1d, density_3d, pool, num_draws); diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 9c6308c843..de1e6b3c31 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -135,8 +135,9 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) { KeyViewType keys("Keys", n); // Test sorting array with all numbers equal - Kokkos::deep_copy(keys, KeyType(1)); - Kokkos::sort(keys, force_kokkos); + ExecutionSpace exec; + Kokkos::deep_copy(exec, keys, KeyType(1)); + Kokkos::sort(exec, keys, force_kokkos); Kokkos::Random_XorShift64_Pool g(1931); Kokkos::fill_random(keys, g, @@ -147,13 +148,16 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) { double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(n, sum(keys), sum_before); + 
Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys), sum_before); - Kokkos::sort(keys, force_kokkos); + Kokkos::sort(exec, keys, force_kokkos); - Kokkos::parallel_reduce(n, sum(keys), sum_after); - Kokkos::parallel_reduce( - n - 1, is_sorted_struct(keys), sort_fails); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys), sum_after); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n - 1), + is_sorted_struct(keys), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -177,8 +181,10 @@ void test_3D_sort_impl(unsigned int n) { double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(keys.extent(0), sum3D(keys), - sum_before); + ExecutionSpace exec; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0)), + sum3D(keys), sum_before); int bin_1d = 1; while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2; @@ -189,15 +195,17 @@ void test_3D_sort_impl(unsigned int n) { using BinOp = Kokkos::BinOp3D; BinOp bin_op(bin_max, min, max); Kokkos::BinSort Sorter(keys, bin_op, false); - Sorter.create_permute_vector(); - Sorter.template sort(keys); + Sorter.create_permute_vector(exec); + Sorter.sort(exec, keys); - Kokkos::parallel_reduce(keys.extent(0), sum3D(keys), - sum_after); - Kokkos::parallel_reduce(keys.extent(0) - 1, - bin3d_is_sorted_struct( - keys, bin_1d, min[0], max[0]), - sort_fails); + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0)), + sum3D(keys), sum_after); + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0) - 1), + bin3d_is_sorted_struct(keys, bin_1d, min[0], + max[0]), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -229,36 +237,36 @@ void test_dynamic_view_sort_impl(unsigned int n) { KeyViewType keys_view("KeysTmp", n); // Test sorting array with all numbers equal - Kokkos::deep_copy(keys_view, KeyType(1)); + ExecutionSpace exec; + Kokkos::deep_copy(exec, 
keys_view, KeyType(1)); Kokkos::deep_copy(keys, keys_view); - Kokkos::sort(keys, 0 /* begin */, n /* end */); + Kokkos::sort(exec, keys, 0 /* begin */, n /* end */); Kokkos::Random_XorShift64_Pool g(1931); Kokkos::fill_random(keys_view, g, Kokkos::Random_XorShift64_Pool< ExecutionSpace>::generator_type::MAX_URAND); - ExecutionSpace().fence(); + exec.fence(); Kokkos::deep_copy(keys, keys_view); - // ExecutionSpace().fence(); double sum_before = 0.0; double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(n, sum(keys_view), - sum_before); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys_view), sum_before); - Kokkos::sort(keys, 0 /* begin */, n /* end */); + Kokkos::sort(exec, keys, 0 /* begin */, n /* end */); - ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda + exec.fence(); // Need this fence to prevent BusError with Cuda Kokkos::deep_copy(keys_view, keys); - // ExecutionSpace().fence(); - Kokkos::parallel_reduce(n, sum(keys_view), - sum_after); - Kokkos::parallel_reduce( - n - 1, is_sorted_struct(keys_view), sort_fails); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys_view), sum_after); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n - 1), + is_sorted_struct(keys_view), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -301,9 +309,10 @@ void test_issue_1160_impl() { for (int i = 0; i < 10; ++i) { h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i)); } - Kokkos::deep_copy(element_, h_element); - Kokkos::deep_copy(x_, h_x); - Kokkos::deep_copy(v_, h_v); + ExecutionSpace exec; + Kokkos::deep_copy(exec, element_, h_element); + Kokkos::deep_copy(exec, x_, h_x); + Kokkos::deep_copy(exec, v_, h_v); using KeyViewType = decltype(element_); using BinOp = Kokkos::BinOp1D; @@ -316,15 +325,16 @@ void test_issue_1160_impl() { Kokkos::BinSort Sorter(element_, begin, end, binner, false); - Sorter.create_permute_vector(); - Sorter.sort(element_, 
begin, end); + Sorter.create_permute_vector(exec); + Sorter.sort(exec, element_, begin, end); - Sorter.sort(x_, begin, end); - Sorter.sort(v_, begin, end); + Sorter.sort(exec, x_, begin, end); + Sorter.sort(exec, v_, begin, end); - Kokkos::deep_copy(h_element, element_); - Kokkos::deep_copy(h_x, x_); - Kokkos::deep_copy(h_v, v_); + Kokkos::deep_copy(exec, h_element, element_); + Kokkos::deep_copy(exec, h_x, x_); + Kokkos::deep_copy(exec, h_v, v_); + exec.fence(); ASSERT_EQ(h_element(0), 9); ASSERT_EQ(h_element(1), 8); diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml index e8763c0b66..73a0d31875 100644 --- a/lib/kokkos/appveyor.yml +++ b/lib/kokkos/appveyor.yml @@ -3,4 +3,8 @@ image: clone_folder: c:\projects\source build_script: - cmd: >- - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake + mkdir build && + cd build && + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake --build . 
--target install && + ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/atomic/main.cpp b/lib/kokkos/benchmarks/atomic/main.cpp index 7b5caa1aee..cc0d3e41e8 100644 --- a/lib/kokkos/benchmarks/atomic/main.cpp +++ b/lib/kokkos/benchmarks/atomic/main.cpp @@ -1,12 +1,12 @@ #include -#include +#include #include template double test_atomic(int L, int N, int M, int K, int R, Kokkos::View offsets) { Kokkos::View output("Output", N); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for (int r = 0; r < R; r++) Kokkos::parallel_for( @@ -28,7 +28,7 @@ template double test_no_atomic(int L, int N, int M, int K, int R, Kokkos::View offsets) { Kokkos::View output("Output", N); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for (int r = 0; r < R; r++) Kokkos::parallel_for( L, KOKKOS_LAMBDA(const int& i) { diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp index 62d7ef4a4c..4fc6ca2c68 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -43,7 +43,7 @@ */ #include -#include +#include template struct Run { diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index 6da2407a08..75f30a3409 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -43,7 +43,7 @@ */ #include -#include +#include #include #include diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 5f10e4dcc1..dd502faaa4 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -43,7 +43,7 @@ */ #include -#include +#include #include #include diff --git a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp index e7ef67e080..311947c197 100644 --- a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp +++ 
b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp @@ -52,35 +52,33 @@ #define HLINE "-------------------------------------------------------------\n" -#if defined(KOKKOS_ENABLE_CUDA) -using StreamHostArray = Kokkos::View::HostMirror; -using StreamDeviceArray = Kokkos::View; -#else -using StreamHostArray = Kokkos::View::HostMirror; -using StreamDeviceArray = Kokkos::View; -#endif +using StreamDeviceArray = + Kokkos::View>; +using StreamHostArray = typename StreamDeviceArray::HostMirror; using StreamIndex = int; +using Policy = Kokkos::RangePolicy>; -double now() { - struct timeval now; - gettimeofday(&now, nullptr); - - return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6); -} - -void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, - StreamDeviceArray& c) { +void perform_set(StreamDeviceArray& a, const double scalar) { Kokkos::parallel_for( - "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; }); + "set", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; }); Kokkos::fence(); } -void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, - StreamDeviceArray& c, const double scalar) { +void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) { Kokkos::parallel_for( - "copy", a.extent(0), + "copy", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; }); + + Kokkos::fence(); +} + +void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c, + const double scalar) { + Kokkos::parallel_for( + "scale", Policy(0, b.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; }); Kokkos::fence(); @@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) { Kokkos::parallel_for( - "add", a.extent(0), + "add", Policy(0, a.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; }); Kokkos::fence(); @@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, 
StreamDeviceArray& b, void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c, const double scalar) { Kokkos::parallel_for( - "triad", a.extent(0), + "triad", Policy(0, a.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; }); Kokkos::fence(); @@ -184,6 +182,7 @@ int run_benchmark() { const double scalar = 3.0; + double setTime = std::numeric_limits::max(); double copyTime = std::numeric_limits::max(); double scaleTime = std::numeric_limits::max(); double addTime = std::numeric_limits::max(); @@ -191,13 +190,10 @@ int run_benchmark() { printf("Initializing Views...\n"); -#if defined(KOKKOS_HAVE_OPENMP) Kokkos::parallel_for( - "init", Kokkos::RangePolicy(0, STREAM_ARRAY_SIZE), -#else - Kokkos::parallel_for( - "init", Kokkos::RangePolicy(0, STREAM_ARRAY_SIZE), -#endif + "init", + Kokkos::RangePolicy(0, + STREAM_ARRAY_SIZE), KOKKOS_LAMBDA(const int i) { a[i] = 1.0; b[i] = 2.0; @@ -209,26 +205,30 @@ int run_benchmark() { Kokkos::deep_copy(dev_b, b); Kokkos::deep_copy(dev_c, c); - double start; - printf("Starting benchmarking...\n"); + Kokkos::Timer timer; + for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) { - start = now(); - perform_copy(dev_a, dev_b, dev_c); - copyTime = std::min(copyTime, (now() - start)); + timer.reset(); + perform_set(dev_c, 1.5); + setTime = std::min(setTime, timer.seconds()); - start = now(); - perform_scale(dev_a, dev_b, dev_c, scalar); - scaleTime = std::min(scaleTime, (now() - start)); + timer.reset(); + perform_copy(dev_a, dev_c); + copyTime = std::min(copyTime, timer.seconds()); - start = now(); + timer.reset(); + perform_scale(dev_b, dev_c, scalar); + scaleTime = std::min(scaleTime, timer.seconds()); + + timer.reset(); perform_add(dev_a, dev_b, dev_c); - addTime = std::min(addTime, (now() - start)); + addTime = std::min(addTime, timer.seconds()); - start = now(); + timer.reset(); perform_triad(dev_a, dev_b, dev_c, scalar); - triadTime = std::min(triadTime, (now() - start)); + triadTime = 
std::min(triadTime, timer.seconds()); } Kokkos::deep_copy(a, dev_a); @@ -240,6 +240,9 @@ int run_benchmark() { printf(HLINE); + printf("Set %11.2f MB/s\n", + (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + setTime); printf("Copy %11.2f MB/s\n", (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / copyTime); diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index 6af091a7d8..43f8a745da 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then > ${HPCBIND_OUT} if [[ ${HPCBIND_TEE} -eq 0 ]]; then if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) else - eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) fi fi fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 4e52e4d09f..27e7d15b9d 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -96,10 +96,10 @@ replace_pragma_ident=0 first_xcompiler_arg=1 # Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop) -if [[ ! 
-z ${NVCC_WRAPPER_TMPDIR+x} ]]; then +if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then temp_dir=${TMPDIR:-/tmp} else - temp_dir=${NVCC_WRAPPER_TMPDIR+x} + temp_dir=${NVCC_WRAPPER_TMPDIR} fi # optimization flag added as a command-line argument @@ -226,14 +226,14 @@ do cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -maxrregcount=*|--maxrregcount=*) + -maxrregcount=*|--maxrregcount=*|-time=*) cuda_args="$cuda_args $1" ;; - -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) + -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time) cuda_args="$cuda_args $1 $2" shift ;; @@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then $host_command elif [ -n "$nvcc_depfile_command" ]; then if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then - echo "$nvcc_command && $nvcc_depfile_command" + echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command" fi - $nvcc_command && $nvcc_depfile_command + TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command else if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then - echo "$nvcc_command" + echo "TMPDIR=${temp_dir} $nvcc_command" fi - $nvcc_command + TMPDIR=${temp_dir} $nvcc_command fi error_code=$? 
diff --git a/lib/kokkos/cmake/CTestConfig.cmake.in b/lib/kokkos/cmake/CTestConfig.cmake.in deleted file mode 100644 index 1f82c0d64d..0000000000 --- a/lib/kokkos/cmake/CTestConfig.cmake.in +++ /dev/null @@ -1,91 +0,0 @@ -#----------------------------------------------------------------------------------------# -# -# CTestConfig.cmake template for Kokkos -# -#----------------------------------------------------------------------------------------# - -# -# dash-board related -# -set(CTEST_PROJECT_NAME "Kokkos") -set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC") -set(CTEST_DROP_METHOD "https") -set(CTEST_DROP_SITE "cdash.nersc.gov") -set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}") -set(CTEST_CDASH_VERSION "1.6") -set(CTEST_CDASH_QUERY_VERSION TRUE) -set(CTEST_SUBMIT_RETRY_COUNT "1") -set(CTEST_SUBMIT_RETRY_DELAY "30") - -# -# configure/build related -# -set(CTEST_BUILD_NAME "@BUILD_NAME@") -set(CTEST_MODEL "@MODEL@") -set(CTEST_SITE "@SITE@") -set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@") -set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@") -set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@") - -# -# configure/build related -# -set(CTEST_UPDATE_TYPE "git") -set(CTEST_UPDATE_VERSION_ONLY ON) -# set(CTEST_GENERATOR "") -# set(CTEST_GENERATOR_PLATFORM "") - -# -# testing related -# -set(CTEST_TIMEOUT "7200") -set(CTEST_TEST_TIMEOUT "7200") -set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100") -set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100") -set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576") - -# -# coverage related -# -set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") - -# -# commands -# -if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") - set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") -endif() -set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") -set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@") -set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ 
--build @BINARY_REALDIR@ --target @TARGET@") -if(NOT WIN32) - set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") -endif() -set(CTEST_COVERAGE_COMMAND "gcov") -set(CTEST_MEMORYCHECK_COMMAND "valgrind") -set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") - -# -# various configs -# -set(APPEND_VALUE @APPEND@) -if(APPEND_VALUE) - set(APPEND_CTEST APPEND) -endif() - -macro(SET_TEST_PROP VAR) - if(NOT "${ARGS}" STREQUAL "") - set(${VAR}_CTEST ${VAR} ${ARGN}) - endif() -endmacro() - -set_test_prop(START @START@) -set_test_prop(END @END@) -set_test_prop(STRIDE @STRIDE@) -set_test_prop(INCLUDE @INCLUDE@) -set_test_prop(EXCLUDE @EXCLUDE@) -set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) -set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@) -set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) -set_test_prop(STOP_TIME @STOP_TIME@) -set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/lib/kokkos/cmake/KokkosCI.cmake b/lib/kokkos/cmake/KokkosCI.cmake deleted file mode 100644 index e8c9af37ad..0000000000 --- a/lib/kokkos/cmake/KokkosCI.cmake +++ /dev/null @@ -1,350 +0,0 @@ -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) - -message(STATUS "") - -get_cmake_property(_cached_vars CACHE_VARIABLES) -set(KOKKOS_CMAKE_ARGS) -set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" - "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") -list(SORT _cached_vars) -foreach(_var ${_cached_vars}) - if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) - list(APPEND KOKKOS_CMAKE_ARGS ${_var}) - if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") - set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") - endif() - endif() -endforeach() - - -#----------------------------------------------------------------------------------------# -# -# Macros and variables -# -#----------------------------------------------------------------------------------------# - -macro(CHECK_REQUIRED VAR) - if(NOT DEFINED ${VAR}) - message(FATAL_ERROR "Error! 
Variable '${VAR}' must be defined") - endif() -endmacro() - -# require the build name variable -CHECK_REQUIRED(BUILD_NAME) - -# uses all args -macro(SET_DEFAULT VAR) - if(NOT DEFINED ${VAR}) - set(${VAR} ${ARGN}) - endif() - # remove these ctest configuration variables from the defines - # passed to the Kokkos configuration - if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) - list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") - endif() -endmacro() - -# uses first arg -- useful for selecting via priority from multiple -# potentially defined variables, e.g.: -# -# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) -# -macro(SET_DEFAULT_ARG1 VAR) - if(NOT DEFINED ${VAR}) - foreach(_ARG ${ARGN}) - if(NOT "${_ARG}" STREQUAL "") - set(${VAR} ${_ARG}) - break() - endif() - endforeach() - endif() - # remove these ctest configuration variables from the defines - # passed to the Kokkos configuration - if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) - list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") - endif() -endmacro() - -# determine the default working directory -if(NOT "$ENV{WORKSPACE}" STREQUAL "") - set(WORKING_DIR "$ENV{WORKSPACE}") -else() - get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) -endif() - -# determine the hostname -execute_process(COMMAND hostname - OUTPUT_VARIABLE HOSTNAME - OUTPUT_STRIP_TRAILING_WHITESPACE) - -SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") - -# get the number of processors -include(ProcessorCount) -ProcessorCount(NUM_PROCESSORS) - -# find git -find_package(Git QUIET) -if(NOT GIT_EXECUTABLE) - unset(GIT_EXECUTABLE CACHE) - unset(GIT_EXECUTABLE) -endif() - -function(EXECUTE_GIT_COMMAND VAR) - set(${VAR} "" PARENT_SCOPE) - execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} - OUTPUT_VARIABLE VAL - RESULT_VARIABLE RET - OUTPUT_STRIP_TRAILING_WHITESPACE - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} - ERROR_QUIET) - string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") - set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) - if(RET EQUAL 0) - 
set(${VAR} "${VAL}" PARENT_SCOPE) - endif() -endfunction() - -# just gets the git branch name if available -function(GET_GIT_BRANCH_NAME VAR) - execute_git_command(GIT_BRANCH branch --show-current) - set(_INVALID "%D" "HEAD") - if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) - execute_git_command(GIT_BRANCH show -s --format=%D) - if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) - execute_git_command(GIT_BRANCH --describe all) - endif() - endif() - # - if(GIT_BRANCH) - string(REPLACE " " ";" _DESC "${GIT_BRANCH}") - # just set it to last one via loop instead of wonky cmake index manip - foreach(_ITR ${_DESC}) - set(GIT_BRANCH "${_ITR}") - endforeach() - set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) - message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") - endif() -endfunction() - -# just gets the git branch name if available -function(GET_GIT_AUTHOR_NAME VAR) - execute_git_command(GIT_AUTHOR show -s --format=%an) - if(GIT_AUTHOR) - string(LENGTH "${GIT_AUTHOR}" STRLEN) - # if the build name gets too long, this can cause submission errors - if(STRLEN GREATER 24) - # remove middle initial - string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}") - # get first and sur name - string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") - string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") - if(S_NAME) - set(GIT_AUTHOR "${S_NAME}") - elseif(F_NAME) - set(GIT_AUTHOR "${F_NAME}") - endif() - endif() - # remove any spaces, quotes, periods, etc. 
- string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") - set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) - message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") - endif() -endfunction() - -# get the name of the branch -GET_GIT_BRANCH_NAME(GIT_BRANCH) -# get the name of the author -GET_GIT_AUTHOR_NAME(GIT_AUTHOR) -# author, prefer git method for consistency -SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) -# SLUG == owner_name/repo_name -SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) -# branch name -SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) -# pull request number -SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) -# get the event type, e.g. push, pull_request, api, cron, etc. -SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) - -if("${BRANCH}" STREQUAL "") - message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") - message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=") -endif() - -#----------------------------------------------------------------------------------------# -# -# Set default values if not provided on command-line -# -#----------------------------------------------------------------------------------------# - -SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory -SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory -SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. 
-SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental -SET_DEFAULT(JOBS 1) # number of parallel ctests -SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case -SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called -SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update -SET_DEFAULT(TARGET "all") # build target -SET_DEFAULT_ARG1(SITE "$ENV{SITE}" - "${HOSTNAME}") # update site -SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" - "${NUM_PROCESSORS}") # number of parallel compile jobs -# -# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are -# '-I START,END,STRIDE' -# -SET_DEFAULT(START "") -SET_DEFAULT(END "") -SET_DEFAULT(STRIDE "") -SET_DEFAULT(INCLUDE "") -SET_DEFAULT(EXCLUDE "") -SET_DEFAULT(INCLUDE_LABEL "") -SET_DEFAULT(EXCLUDE_LABEL "") -SET_DEFAULT(PARALLEL_LEVEL "") -SET_DEFAULT(STOP_TIME "") -SET_DEFAULT(LABELS "") -SET_DEFAULT(NOTES "") - -# default static build tag for Nightly -set(BUILD_TAG "${BRANCH}") - -if(NOT BUILD_TYPE) - # default for kokkos if not specified - set(BUILD_TYPE "RelWithDebInfo") -endif() - -# generate dynamic name if continuous or experimental model -if(NOT "${MODEL}" STREQUAL "Nightly") - if(EVENT_TYPE AND PULL_REQUEST_NUM) - # e.g. pull_request/123 - if(AUTHOR) - set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") - else() - set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") - endif() - elseif(SLUG) - # e.g. 
owner_name/repo_name - set(BUILD_TAG "${SLUG}") - elseif(AUTHOR) - set(BUILD_TAG "${AUTHOR}/${BRANCH}") - endif() - if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) - set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") - endif() -endif() - -# unnecessary -string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") -string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") - -message(STATUS "BUILD_TAG: ${BUILD_TAG}") - -set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") - -# colons in build name create extra (empty) entries in CDash -string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") -# unnecessary info -string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") -# consistency -string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") -string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") -# miscellaneous from missing fields -string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") -string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") - -# check binary directory -if(EXISTS ${BINARY_DIR}) - if(NOT IS_DIRECTORY "${BINARY_DIR}") - message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") - endif() - file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") - if(NOT "${BINARY_DIR_FILES}" STREQUAL "") - message(FATAL_ERROR "Error! 
'${BINARY_DIR}' already exists and is not empty!") - endif() -endif() - -get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) -get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) - -#----------------------------------------------------------------------------------------# -# -# Generate the CTestConfig.cmake -# -#----------------------------------------------------------------------------------------# - -set(CONFIG_ARGS) -foreach(_ARG ${KOKKOS_CMAKE_ARGS}) - if(NOT "${${_ARG}}" STREQUAL "") - get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) - if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") - if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") - set(_ARG_TYPE "BOOL") - elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") - set(_ARG_TYPE "FILEPATH") - elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") - set(_ARG_TYPE "PATH") - elseif(NOT "${${_ARG}}" STREQUAL "") - set(_ARG_TYPE "STRING") - endif() - endif() - set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") - endif() -endforeach() - -file(WRITE ${BINARY_REALDIR}/initial-cache.cmake -" -set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") -${CONFIG_ARGS} -") - -file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) -message(STATUS "Initial cache:\n${_CACHE_INFO}") - -# initialize the cache -set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") - - -# generate the CTestConfig.cmake -configure_file( - ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in - ${BINARY_REALDIR}/CTestConfig.cmake - @ONLY) - -# copy/generate the dashboard script -configure_file( - ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in - ${BINARY_REALDIR}/KokkosCTest.cmake - @ONLY) - -# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake -execute_process( - COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake - WORKING_DIRECTORY ${BINARY_REALDIR} - ) - -#----------------------------------------------------------------------------------------# -# -# 
Execute CTest -# -#----------------------------------------------------------------------------------------# - -message(STATUS "") -message(STATUS "BUILD_NAME: ${BUILD_NAME}") -message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") -message(STATUS "") - -# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" -string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") - -execute_process( - COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} - RESULT_VARIABLE RET - WORKING_DIRECTORY ${BINARY_REALDIR} - ) - -# ensure that any non-zero result variable gets propagated -if(NOT RET EQUAL 0) - message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") -endif() diff --git a/lib/kokkos/cmake/KokkosCTest.cmake.in b/lib/kokkos/cmake/KokkosCTest.cmake.in deleted file mode 100644 index b6917f3cc1..0000000000 --- a/lib/kokkos/cmake/KokkosCTest.cmake.in +++ /dev/null @@ -1,261 +0,0 @@ -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) - -if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") -endif() - -include(ProcessorCount) -ProcessorCount(CTEST_PROCESSOR_COUNT) - -cmake_policy(SET CMP0009 NEW) -cmake_policy(SET CMP0011 NEW) - -# ---------------------------------------------------------------------------- # -# -- Commands -# ---------------------------------------------------------------------------- # -find_program(CTEST_CMAKE_COMMAND NAMES cmake) -find_program(CTEST_UNAME_COMMAND NAMES uname) - -find_program(CTEST_BZR_COMMAND NAMES bzr) -find_program(CTEST_CVS_COMMAND NAMES cvs) -find_program(CTEST_GIT_COMMAND NAMES git) -find_program(CTEST_HG_COMMAND NAMES hg) -find_program(CTEST_P4_COMMAND NAMES p4) -find_program(CTEST_SVN_COMMAND NAMES svn) - -find_program(VALGRIND_COMMAND NAMES valgrind) -find_program(GCOV_COMMAND NAMES gcov) -find_program(LCOV_COMMAND NAMES llvm-cov) -find_program(MEMORYCHECK_COMMAND NAMES valgrind ) - 
-set(MEMORYCHECK_TYPE Valgrind) -# set(MEMORYCHECK_TYPE Purify) -# set(MEMORYCHECK_TYPE BoundsChecker) -# set(MEMORYCHECK_TYPE ThreadSanitizer) -# set(MEMORYCHECK_TYPE AddressSanitizer) -# set(MEMORYCHECK_TYPE LeakSanitizer) -# set(MEMORYCHECK_TYPE MemorySanitizer) -# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) -set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") - -# ---------------------------------------------------------------------------- # -# -- Settings -# ---------------------------------------------------------------------------- # -## -- Process timeout in seconds -set(CTEST_TIMEOUT "7200") -## -- Set output to English -set(ENV{LC_MESSAGES} "en_EN" ) - - -# ---------------------------------------------------------------------------- # -# -- Copy ctest configuration file -# ---------------------------------------------------------------------------- # -macro(COPY_CTEST_CONFIG_FILES) - - foreach(_FILE CTestConfig.cmake CTestCustom.cmake) - - # if current directory is not binary or source directory - if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND - NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") - - # if file exists in current directory - if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) - configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} - ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) - endif() - - # if source and binary differ - elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") - - # if file exists in source directory but not in binary directory - if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND - NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) - configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} - ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) - endif() - - endif() - endforeach() - -endmacro() - -ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") - -message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") - 
-#-------------------------------------------------------------------------# -# Start -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...") -message(STATUS "") - -ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST} - ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY}) - - -#-------------------------------------------------------------------------# -# Config -# -copy_ctest_config_files() -ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") - - -#-------------------------------------------------------------------------# -# Update -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...") -message(STATUS "") - -ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" - RETURN_VALUE up_ret) - - -#-------------------------------------------------------------------------# -# Configure -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...") -message(STATUS "") - -ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" - SOURCE ${CTEST_SOURCE_DIRECTORY} - ${APPEND_CTEST} - OPTIONS "${CTEST_CONFIGURE_OPTIONS}" - RETURN_VALUE config_ret) - - -#-------------------------------------------------------------------------# -# Echo configure log bc Damien wants to delay merging this PR for eternity -# -file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log") -# should only have one but loop just for safety -foreach(_LOG ${_configure_log}) - file(READ ${_LOG} _LOG_MESSAGE) - message(STATUS "Configure Log: ${_LOG}") - message(STATUS "\n${_LOG_MESSAGE}\n") -endforeach() - - -#-------------------------------------------------------------------------# -# Build -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...") -message(STATUS "") - -ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" - ${APPEND_CTEST} - RETURN_VALUE build_ret) - - -#-------------------------------------------------------------------------# 
-# Echo build log bc Damien wants to delay merging this PR for eternity -# -file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log") -# should only have one but loop just for safety -foreach(_LOG ${_build_log}) - file(READ ${_LOG} _LOG_MESSAGE) - message(STATUS "Build Log: ${_LOG}") - message(STATUS "\n${_LOG_MESSAGE}\n") -endforeach() - - -#-------------------------------------------------------------------------# -# Test -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...") -message(STATUS "") - -ctest_test(RETURN_VALUE test_ret - ${APPEND_CTEST} - ${START_CTEST} - ${END_CTEST} - ${STRIDE_CTEST} - ${INCLUDE_CTEST} - ${EXCLUDE_CTEST} - ${INCLUDE_LABEL_CTEST} - ${EXCLUDE_LABEL_CTEST} - ${PARALLEL_LEVEL_CTEST} - ${STOP_TIME_CTEST} - SCHEDULE_RANDOM OFF) - - -#-------------------------------------------------------------------------# -# Coverage -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...") -message(STATUS "") - -execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS} - WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY} - ERROR_QUIET) - -ctest_coverage(${APPEND_CTEST} - ${CTEST_COVERAGE_LABELS} - RETURN_VALUE cov_ret) - - -#-------------------------------------------------------------------------# -# MemCheck -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...") -message(STATUS "") - -ctest_memcheck(RETURN_VALUE mem_ret - ${APPEND_CTEST} - ${START_CTEST} - ${END_CTEST} - ${STRIDE_CTEST} - ${INCLUDE_CTEST} - ${EXCLUDE_CTEST} - ${INCLUDE_LABEL_CTEST} - ${EXCLUDE_LABEL_CTEST} - ${PARALLEL_LEVEL_CTEST}) - - -#-------------------------------------------------------------------------# -# Submit -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...") -message(STATUS "") - -file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake") -foreach(_FILE 
${NOTE_FILES}) - message(STATUS "Including CTest notes files: \"${_FILE}\"...") - include("${_FILE}") -endforeach() - -# capture submit error so it doesn't fail because of a submission error -ctest_submit(RETURN_VALUE submit_ret - RETRY_COUNT 2 - RETRY_DELAY 10 - CAPTURE_CMAKE_ERROR submit_err) - -#-------------------------------------------------------------------------# -# Submit -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})") -message(STATUS "") - - -#-------------------------------------------------------------------------# -# Non-zero exit codes for important errors -# -if(NOT config_ret EQUAL 0) - message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}") -endif() - -if(NOT build_ret EQUAL 0) - message(FATAL_ERROR "Error during build! Exit code: ${build_ret}") -endif() - -if(NOT test_ret EQUAL 0) - message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}") -endif() diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 3455b0cb42..07baa0a5f0 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -41,6 +41,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC +#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG @@ -49,17 +50,21 @@ #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE +#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 +#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN -#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 
+#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated +#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC #cmakedefine KOKKOS_USE_LIBRT #cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL +#cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@ @@ -79,6 +84,12 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine KOKKOS_ARCH_INTEL_GEN +#cmakedefine KOKKOS_ARCH_INTEL_DG1 +#cmakedefine KOKKOS_ARCH_INTEL_GEN9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN11 +#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP +#cmakedefine KOKKOS_ARCH_INTEL_XEHP +#cmakedefine KOKKOS_ARCH_INTEL_GPU #cmakedefine KOKKOS_ARCH_KEPLER #cmakedefine KOKKOS_ARCH_KEPLER30 #cmakedefine KOKKOS_ARCH_KEPLER32 @@ -95,6 +106,7 @@ #cmakedefine KOKKOS_ARCH_VOLTA70 #cmakedefine KOKKOS_ARCH_VOLTA72 #cmakedefine KOKKOS_ARCH_TURING75 +#cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 8d58d96415..0c825c59e0 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -29,7 +29,12 @@ ELSE() ENDIF() include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER) +IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +ELSE() + SET(KOKKOS_CUDA_ERROR DEFAULT_MSG) +ENDIF() +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER) IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart diff --git 
a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake new file mode 100644 index 0000000000..be70b711e0 --- /dev/null +++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake new file mode 100644 index 0000000000..826f5021d3 --- /dev/null +++ b/lib/kokkos/cmake/deps/quadmath.cmake @@ -0,0 +1,46 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath + REQUIRED_HEADERS quadmath.h + REQUIRED_LIBS_NAMES quadmath +) diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index e8b85542c6..c4637339f3 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3 HOST "AMD Zen3 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") +KOKKOS_ARCH_OPTION(VEGA90A GPU "" ) KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") - +KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU") +KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9") +KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11") +KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP") +KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP") IF(KOKKOS_ENABLE_COMPILER_WARNINGS) @@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") + # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH + IF(Kokkos_ENABLE_LIBQUADMATH) + # warning: non-standard suffix on floating constant 
[-Wpedantic] + LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + ENDIF() + # OpenMPTarget compilers give erroneous warnings about sign comparison in loops IF(KOKKOS_ENABLE_OPENMPTARGET) LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") @@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) COMPILER_SPECIFIC_FLAGS( COMPILER_ID CMAKE_CXX_COMPILER_ID - PGI NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED GNU ${GNU_WARNINGS} DEFAULT ${COMMON_WARNINGS} ) @@ -158,16 +169,18 @@ ENDIF() IF (KOKKOS_ARCH_ARMV80) COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8-a ) ENDIF() IF (KOKKOS_ARCH_ARMV81) COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8.1-a ) ENDIF() @@ -175,8 +188,9 @@ ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX) SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8-a -mtune=thunderx ) ENDIF() @@ -184,23 +198,28 @@ ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX2) SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 ) ENDIF() IF (KOKKOS_ARCH_A64FX) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8.2-a+sve - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GCC -march=armv8.2-a+sve -msve-vector-bits=512 + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 ) ENDIF() IF (KOKKOS_ARCH_ZEN) 
COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen DEFAULT -march=znver1 -mtune=znver1 ) SET(KOKKOS_ARCH_AMD_ZEN ON) @@ -209,7 +228,9 @@ ENDIF() IF (KOKKOS_ARCH_ZEN2) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen2 DEFAULT -march=znver2 -mtune=znver2 ) SET(KOKKOS_ARCH_AMD_ZEN2 ON) @@ -218,7 +239,9 @@ ENDIF() IF (KOKKOS_ARCH_ZEN3) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen2 DEFAULT -march=znver3 -mtune=znver3 ) SET(KOKKOS_ARCH_AMD_ZEN3 ON) @@ -227,8 +250,9 @@ ENDIF() IF (KOKKOS_ARCH_WSM) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xSSE4.2 - PGI -tp=nehalem + NVHPC -tp=px Cray NO-VALUE-SPECIFIED DEFAULT -msse4.2 ) @@ -238,8 +262,9 @@ ENDIF() IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx - PGI -tp=sandybridge + NVHPC -tp=sandybridge Cray NO-VALUE-SPECIFIED DEFAULT -mavx ) @@ -248,8 +273,9 @@ ENDIF() IF (KOKKOS_ARCH_HSW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX2 - PGI -tp=haswell + NVHPC -tp=haswell Cray NO-VALUE-SPECIFIED DEFAULT -march=core-avx2 -mtune=core-avx2 ) @@ -258,8 +284,9 @@ ENDIF() IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX2 - PGI -tp=haswell + NVHPC -tp=haswell Cray NO-VALUE-SPECIFIED DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm ) @@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL) #avx512-mic SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xMIC-AVX512 - PGI NO-VALUE-SPECIFIED + NVHPC -tp=knl Cray NO-VALUE-SPECIFIED DEFAULT -march=knl -mtune=knl ) @@ -279,6 +307,7 @@ ENDIF() IF (KOKKOS_ARCH_KNC) SET(KOKKOS_USE_ISA_KNC ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mmic ) ENDIF() @@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX) #avx512-xeon SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX512 - PGI NO-VALUE-SPECIFIED + NVHPC -tp=skylake Cray NO-VALUE-SPECIFIED DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm ) @@ -304,7 +334,8 @@ ENDIF() IF (KOKKOS_ARCH_POWER7) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC NO-VALUE-SPECIFIED DEFAULT -mcpu=power7 -mtune=power7 ) SET(KOKKOS_USE_ISA_POWERPCBE ON) @@ -312,16 +343,16 @@ ENDIF() IF (KOKKOS_ARCH_POWER8) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED - NVIDIA NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC -tp=pwr8 DEFAULT -mcpu=power8 -mtune=power8 ) ENDIF() IF (KOKKOS_ARCH_POWER9) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED - NVIDIA NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC -tp=pwr9 DEFAULT -mcpu=power9 -mtune=power9 ) ENDIF() @@ -368,7 +399,7 @@ ENDIF() IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl + DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int ) COMPILER_SPECIFIC_OPTIONS( DEFAULT -fsycl-unnamed-lambda @@ -443,20 +474,58 @@ ENDFUNCTION() CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60 CHECK_AMDGPU_ARCH(VEGA908 gfx908) +CHECK_AMDGPU_ARCH(VEGA90A gfx90a) IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. 
" - "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) + EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + STRING(LENGTH "${GPU_ARCHS}" len_str) + # enumerator always output gfx000 as the first line + IF(${len_str} LESS 8) + MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. " + "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + ENDIF() + ELSE() + MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. " + "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + ENDIF() +ENDIF() + +MACRO(CHECK_MULTIPLE_INTEL_ARCH) + IF(KOKKOS_ARCH_INTEL_GPU) + MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + ENDIF() + SET(KOKKOS_ARCH_INTEL_GPU ON) +ENDMACRO() + +IF(KOKKOS_ARCH_INTEL_GEN) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_DG1) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN9) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN11) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN12LP) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_XEHP) + CHECK_MULTIPLE_INTEL_ARCH() ENDIF() IF (KOKKOS_ENABLE_OPENMPTARGET) SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) IF (CLANG_CUDA_ARCH) - STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH}) + STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) COMPILER_SPECIFIC_FLAGS( Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda - XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG} - PGI -gpu=${PGI_CUDA_ARCH} + XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG} + NVHPC -gpu=${NVHPC_CUDA_ARCH} ) ENDIF() SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) @@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} 
-fopenmp-targets=amdgcn-amd-amdhsa ) ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) + IF (KOKKOS_ARCH_INTEL_GPU) COMPILER_SPECIFIC_FLAGS( IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) @@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() ELSEIF(KOKKOS_ARCH_INTEL_GEN) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp" ) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 23847263a9..5afed4fb0e 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -137,7 +137,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher" SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher") +SET(KOKKOS_MESSAGE_TEXT 
"${KOKKOS_MESSAGE_TEXT}\n HIPCC 4.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n") IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) @@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() + # Treat PGI internally as NVHPC to simplify handling both compilers. + # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is + # backward-compatible to pgc++. + SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +ENDIF() + +IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) ENDIF() STRING(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index d7f83ddbdf..7fd0794036 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} IntelLLVM -Xcompiler -fiopenmp - PGI -Xcompiler -mp + NVHPC -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp DEFAULT -Xcompiler -fopenmp @@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP) Clang ${ClangOpenMPFlag} IntelLLVM -fiopenmp AppleClang -Xpreprocessor -fopenmp - PGI -mp + NVHPC -mp Cray NO-VALUE-SPECIFIED XL -qsmp=omp DEFAULT -fopenmp @@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) Clang ${ClangOpenMPFlag} -Wno-openmp-mapping IntelLLVM -fiopenmp -Wno-openmp-mapping XL -qsmp=omp -qoffload -qnoeh - PGI -mp=gpu + NVHPC -mp=gpu DEFAULT -fopenmp ) COMPILER_SPECIFIC_DEFS( diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 95bce66c7b..4cb8bd20f5 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +# Set the Default for Desul Atomics usage. +set(_DESUL_ATOMICS_DEFAULT ON) + KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +# As of 08/12/2021 CudaMallocAsync causes issues if UCX is used as MPI communication layer. 
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 ON "Whether code deprecated in major release 3 is available" ) +KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH OFF "Whether HPX supports asynchronous dispatch") KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") @@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tu KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") +# This option will go away eventually, but allows fallback to old implementation when needed. +KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS ON "Whether to use desul based atomics - option only during beta") + IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index e1a3e5f8bd..02c9a911b1 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) + SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 707fb000af..1eb0592c7f 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ 
b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) + ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index d8d044c9d7..51bad521c4 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF) ENDIF() KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread) +IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + SET(LIBQUADMATH_DEFAULT ON) +ELSE() + SET(LIBQUADMATH_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) @@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) +KOKKOS_IMPORT_TPL(LIBQUADMATH) #Convert list to newlines (which CMake doesn't always like in cache variables) STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake new file mode 100644 index 0000000000..1f7587da80 --- /dev/null +++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -0,0 +1,46 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. 
Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath + REQUIRED_HEADERS quadmath.h + REQUIRED_LIBS_NAMES quadmath +) diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp index 8c507c7662..7ed9a0271a 100644 --- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp +++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -48,7 +48,7 @@ #include #include -#include +#include // Compare performance of DynRankView to View, specific focus on the parenthesis // operators diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp index 65de551b27..16b74a4997 100644 --- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp +++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp @@ -48,7 +48,7 @@ #include #include -#include +#include // This test will simulate global ids diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp index 0f3ba103ef..8a23f59d32 100644 --- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -46,7 +46,7 @@ #define KOKKOS_TEST_SCATTER_VIEW_HPP #include -#include +#include namespace Perf { diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp index c31412552a..4547d5c357 100644 --- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp +++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp @@ -43,7 +43,7 @@ #ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP #define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP -#include +#include 
#include #include diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index ea1d6dde5d..c5b66f05a3 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -76,20 +76,25 @@ class Bitset { using execution_space = Device; using size_type = unsigned int; - enum { BIT_SCAN_REVERSE = 1u }; - enum { MOVE_HINT_BACKWARD = 2u }; + static constexpr unsigned BIT_SCAN_REVERSE = 1u; + static constexpr unsigned MOVE_HINT_BACKWARD = 2u; - enum { - BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u, - BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE, - BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD, - BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD - }; + static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u; + static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = + BIT_SCAN_REVERSE; + static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = + MOVE_HINT_BACKWARD; + static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = + BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD; private: - enum { block_size = static_cast(sizeof(unsigned) * CHAR_BIT) }; - enum { block_mask = block_size - 1u }; - enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) }; + enum : unsigned { + block_size = static_cast(sizeof(unsigned) * CHAR_BIT) + }; + enum : unsigned { block_mask = block_size - 1u }; + enum : unsigned { + block_shift = Kokkos::Impl::integral_power_of_two(block_size) + }; public: /// constructor @@ -317,14 +322,18 @@ class ConstBitset { enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) }; public: + KOKKOS_FUNCTION ConstBitset() : m_size(0) {} + KOKKOS_FUNCTION ConstBitset(Bitset const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} + KOKKOS_FUNCTION ConstBitset(ConstBitset const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} + KOKKOS_FUNCTION ConstBitset& operator=(Bitset const& rhs) { 
this->m_size = rhs.m_size; this->m_blocks = rhs.m_blocks; @@ -332,6 +341,7 @@ class ConstBitset { return *this; } + KOKKOS_FUNCTION ConstBitset& operator=(ConstBitset const& rhs) { this->m_size = rhs.m_size; this->m_blocks = rhs.m_blocks; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 45710d1f73..f55d0f2b7f 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -597,8 +597,10 @@ class DualView : public ViewTraits { } if (std::is_same::value) { - typename t_dev::execution_space().fence(); - typename t_host::execution_space().fence(); + typename t_dev::execution_space().fence( + "Kokkos::DualView<>::sync: fence after syncing DualView"); + typename t_host::execution_space().fence( + "Kokkos::DualView<>::sync: fence after syncing DualView"); } } @@ -776,10 +778,11 @@ class DualView : public ViewTraits { /// If \c Device is the same as this DualView's device type, then /// mark the device's data as modified. Otherwise, mark the host's /// data as modified. - template + template * = + nullptr> void modify() { if (modified_flags.data() == nullptr) return; - if (impl_dualview_is_single_device::value) return; int dev = get_device_side(); if (dev == 1) { // if Device is the same as DualView's device type @@ -811,8 +814,17 @@ class DualView : public ViewTraits { #endif } + template < + class Device, class Dummy = DualView, + std::enable_if_t* = nullptr> + void modify() { + return; + } + + template * = + nullptr> inline void modify_host() { - if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(0) = (modified_flags(1) > modified_flags(0) ? 
modified_flags(1) @@ -832,8 +844,17 @@ class DualView : public ViewTraits { } } + template < + class Dummy = DualView, + std::enable_if_t* = nullptr> + inline void modify_host() { + return; + } + + template * = + nullptr> inline void modify_device() { - if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(1) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) @@ -853,6 +874,13 @@ class DualView : public ViewTraits { } } + template < + class Dummy = DualView, + std::enable_if_t* = nullptr> + inline void modify_device() { + return; + } + inline void clear_sync_state() { if (modified_flags.data() != nullptr) modified_flags(1) = modified_flags(0) = 0; @@ -875,8 +903,15 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) { - ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - h_view = create_mirror_view(d_view); + const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7}; + const bool sizeMismatch = + Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); + + if (sizeMismatch) { + ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); + h_view = create_mirror_view(d_view); + } else + ::Kokkos::deep_copy(d_view, typename t_dev::value_type{}); /* Reset dirty flags */ if (modified_flags.data() == nullptr) { @@ -897,41 +932,31 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) { + const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7}; + const bool sizeMismatch = + Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); + if (modified_flags.data() == nullptr) { modified_flags = t_modified_flags("DualView::modified_flags"); } if (modified_flags(1) >= modified_flags(0)) { /* Resize on Device */ - 
::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - h_view = create_mirror_view(d_view); - - /* Mark Device copy as modified */ - modified_flags(1) = modified_flags(1) + 1; + if (sizeMismatch) { + ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7); + h_view = create_mirror_view(d_view); + /* Mark Device copy as modified */ + modified_flags(1) = modified_flags(1) + 1; + } } else { /* Realloc on Device */ - - ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - - const bool sizeMismatch = - (h_view.extent(0) != n0) || (h_view.extent(1) != n1) || - (h_view.extent(2) != n2) || (h_view.extent(3) != n3) || - (h_view.extent(4) != n4) || (h_view.extent(5) != n5) || - (h_view.extent(6) != n6) || (h_view.extent(7) != n7); - if (sizeMismatch) + if (sizeMismatch) { ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = create_mirror_view(typename t_dev::execution_space(), h_view); - t_host temp_view = create_mirror_view(d_view); - - /* Remap on Host */ - Kokkos::deep_copy(temp_view, h_view); - - h_view = temp_view; - - d_view = create_mirror_view(typename t_dev::execution_space(), h_view); - - /* Mark Host copy as modified */ - modified_flags(0) = modified_flags(0) + 1; + /* Mark Host copy as modified */ + modified_flags(0) = modified_flags(0) + 1; + } } } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index c6323fef93..b673c53a4e 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits { // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation"); } #endif //------------------------------------------------------------ @@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits { #if defined(KOKKOS_ENABLE_CUDA) if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation"); } #endif //------------------------------------------------------------ @@ -1404,7 +1406,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::Impl::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value, ""); using traits_type = Kokkos::ViewTraits& lhs, namespace Kokkos { namespace Impl { -template +template struct DynRankViewFill { using const_value_type = typename OutputView::traits::const_value_type; @@ -1693,9 +1695,11 @@ inline void deep_copy( typename ViewTraits::value_type>::value, "deep_copy requires non-const type"); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(DynRankView, value_type): fence before filling view"); Kokkos::Impl::DynRankViewFill >(dst, value); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(DynRankView, value_type): fence after filling view"); } /** \brief Deep copy into a value in Host memory from a view. 
*/ @@ -1711,10 +1715,13 @@ inline void deep_copy( using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(value_type, DynRankView): fence before copying " + "value"); Kokkos::Impl::DeepCopy(&dst, src.data(), sizeof(ST)); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(value_type, DynRankView): fence after copying value"); } //---------------------------------------------------------------------------- @@ -1744,14 +1751,14 @@ inline void deep_copy( enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; enum { SrcExecCanAccessDst = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if ((void*)dst.data() != (void*)src.data()) { @@ -1762,10 +1769,14 @@ inline void deep_copy( // memory then can byte-wise copy if (rank(src) == 0 && rank(dst) == 0) { using value_type = typename dst_type::value_type; - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-0 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), sizeof(value_type)); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-0 views"); } else if (std::is_same< typename DstType::traits::value_type, typename SrcType::traits::non_const_value_type>::value && @@ -1787,10 +1798,14 @@ inline void deep_copy( dst.extent(6) == src.extent(6) && dst.extent(7) == src.extent(7)) { const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-1 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), nbytes); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-1 views"); } else if (std::is_same< 
typename DstType::traits::value_type, typename SrcType::traits::non_const_value_type>::value && @@ -1817,29 +1832,43 @@ inline void deep_copy( dst.stride_6() == src.stride_6() && dst.stride_7() == src.stride_7()) { const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-1 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), nbytes); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-1 views"); } else if (DstExecCanAccessSrc) { // Copying data between views in accessible memory spaces and either // non-contiguous or incompatible shape. - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "remapping views of incompatible shape"); Kokkos::Impl::DynRankViewRemap(dst, src); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "remapping views of incompatible shape"); } else if (SrcExecCanAccessDst) { // Copying data between views in accessible memory spaces and either // non-contiguous or incompatible shape. 
- Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "remapping views of incompatible shape"); Kokkos::Impl::DynRankViewRemap( dst, src); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "remapping views of incompatible shape"); } else { Kokkos::Impl::throw_runtime_exception( "deep_copy given views that would require a temporary allocation"); } } else { - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same " + "src and dst"); } } diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index cc949d4c55..4acae56970 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -53,36 +53,203 @@ namespace Kokkos { namespace Experimental { -// Simple metafunction for choosing memory space -// In the current implementation, if memory_space == CudaSpace, -// use CudaUVMSpace for the chunk 'array' allocation, which -// contains will contain pointers to chunks of memory allocated -// in CudaSpace namespace Impl { -template -struct ChunkArraySpace { - using memory_space = MemSpace; + +/// Utility class to manage memory for chunked arrays on the host and +/// device. Allocates/deallocates memory on both the host and device along with +/// providing utilities for creating mirrors and deep copying between them. 
+template +struct ChunkedArrayManager { + using value_type = ValueType; + using pointer_type = ValueType*; + using track_type = Kokkos::Impl::SharedAllocationTracker; + + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; + + template + friend struct ChunkedArrayManager; + + template + inline ChunkedArrayManager(const ChunkedArrayManager& rhs) + : m_valid(rhs.m_valid), + m_chunk_max(rhs.m_chunk_max), + m_chunks((ValueType**)(rhs.m_chunks)), + m_track(rhs.m_track), + m_chunk_size(rhs.m_chunk_size) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable, + "Incompatible ChunkedArrayManager copy construction"); + } + + ChunkedArrayManager(const unsigned arg_chunk_max, + const unsigned arg_chunk_size) + : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {} + + private: + struct ACCESSIBLE_TAG {}; + struct INACCESSIBLE_TAG {}; + + ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks, + const unsigned arg_chunk_max) + : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {} + + ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max, + const unsigned arg_chunk_size) + : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {} + + public: + template + struct IsAccessibleFrom; + + template + struct IsAccessibleFrom< + Space, typename std::enable_if_t::accessible>> : std::true_type {}; + + template + struct IsAccessibleFrom< + Space, typename std::enable_if_t::accessible>> : std::false_type {}; + + template + static ChunkedArrayManager create_mirror( + ChunkedArrayManager const& other, + typename std::enable_if::value>::type* = + nullptr) { + return ChunkedArrayManager{ + ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max}; + } + + template + static ChunkedArrayManager create_mirror( + 
ChunkedArrayManager const& other, + typename std::enable_if::value>::type* = + nullptr) { + using tag_type = + typename ChunkedArrayManager::INACCESSIBLE_TAG; + return ChunkedArrayManager{tag_type{}, other.m_chunk_max, + other.m_chunk_size}; + } + + public: + void allocate_device(const std::string& label) { + if (m_chunks == nullptr) { + m_chunks = reinterpret_cast(MemorySpace().allocate( + label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2)))); + } + } + + void initialize() { + for (unsigned i = 0; i < m_chunk_max + 2; i++) { + m_chunks[i] = nullptr; + } + m_valid = true; + } + + private: + /// Custom destroy functor for deallocating array chunks along with a linked + /// allocation + template + struct Destroy { + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; + Destroy& operator=(const Destroy&) = default; + + Destroy(std::string label, value_type** arg_chunk, + const unsigned arg_chunk_max, const unsigned arg_chunk_size, + value_type** arg_linked) + : m_label(label), + m_chunks(arg_chunk), + m_linked(arg_linked), + m_chunk_max(arg_chunk_max), + m_chunk_size(arg_chunk_size) {} + + void execute() { + // Destroy the array of chunk pointers. + // Two entries beyond the max chunks are allocation counters. + uintptr_t const len = + *reinterpret_cast(m_chunks + m_chunk_max); + for (unsigned i = 0; i < len; i++) { + Space().deallocate(m_label.c_str(), m_chunks[i], + sizeof(value_type) * m_chunk_size); + } + // Destroy the linked allocation if we have one. 
+ if (m_linked != nullptr) { + Space().deallocate(m_label.c_str(), m_linked, + (sizeof(value_type*) * (m_chunk_max + 2))); + } + } + + void destroy_shared_allocation() { execute(); } + + std::string m_label; + value_type** m_chunks = nullptr; + value_type** m_linked = nullptr; + unsigned m_chunk_max; + unsigned m_chunk_size; + }; + + public: + template + void allocate_with_destroy(const std::string& label, + pointer_type* linked_allocation = nullptr) { + using destroy_type = Destroy; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == + // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in + // Destroy's execute(...) method + record_type* const record = record_type::allocate( + MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2))); + m_chunks = static_cast(record->data()); + m_track.assign_allocated_record_to_uninitialized(record); + + record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size, + linked_allocation); + } + + pointer_type* get_ptr() const { return m_chunks; } + + template + typename std::enable_if::value>::type deep_copy_to( + ChunkedArrayManager const& other) { + Kokkos::Impl::DeepCopy( + other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2)); + } + + template + typename std::enable_if::value>::type deep_copy_to( + ChunkedArrayManager const&) { + // no-op + } + + KOKKOS_INLINE_FUNCTION + pointer_type* operator+(int i) const { return m_chunks + i; } + + KOKKOS_INLINE_FUNCTION + pointer_type& operator[](int i) const { return m_chunks[i]; } + + track_type const& track() const { return m_track; } + + KOKKOS_INLINE_FUNCTION + bool valid() const { return m_valid; } + + private: + bool m_valid = false; + unsigned m_chunk_max = 0; + pointer_type* m_chunks = nullptr; + track_type m_track; + unsigned m_chunk_size = 0; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ChunkArraySpace { - using memory_space = typename 
Kokkos::CudaUVMSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -struct ChunkArraySpace { - using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_SYCL -template <> -struct ChunkArraySpace { - using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif -} // end namespace Impl +} /* end namespace Impl */ /** \brief Dynamic views are restricted to rank-one and no layout. * Resize only occurs on host outside of parallel_regions. @@ -93,6 +260,13 @@ class DynamicView : public Kokkos::ViewTraits { public: using traits = Kokkos::ViewTraits; + using value_type = typename traits::value_type; + using device_space = typename traits::memory_space; + using host_space = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using device_accessor = Impl::ChunkedArrayManager; + using host_accessor = Impl::ChunkedArrayManager; + private: template friend class DynamicView; @@ -108,7 +282,7 @@ class DynamicView : public Kokkos::ViewTraits { "DynamicView only implemented for non-specialized View type"); template ::accessible> + Space, device_space>::accessible> struct verify_space { KOKKOS_FORCEINLINE_FUNCTION static void check() {} }; @@ -123,9 +297,8 @@ class DynamicView : public Kokkos::ViewTraits { }; private: - track_type m_track; - typename traits::value_type** m_chunks = - nullptr; // array of pointers to 'chunks' of memory + device_accessor m_chunks; + host_accessor m_chunks_host; unsigned m_chunk_shift; // ceil(log2(m_chunk_size)) unsigned m_chunk_mask; // m_chunk_size - 1 unsigned m_chunk_max; // number of entries in the chunk array - each pointing @@ -173,7 +346,8 @@ class DynamicView : public Kokkos::ViewTraits { KOKKOS_INLINE_FUNCTION size_t allocation_extent() const noexcept { - uintptr_t n = *reinterpret_cast(m_chunks + m_chunk_max); + uintptr_t n = + *reinterpret_cast(m_chunks_host + m_chunk_max); return (n << m_chunk_shift); } @@ -183,7 +357,7 @@ class DynamicView : public 
Kokkos::ViewTraits { KOKKOS_INLINE_FUNCTION size_t size() const noexcept { size_t extent_0 = - *reinterpret_cast(m_chunks + m_chunk_max + 1); + *reinterpret_cast(m_chunks_host + m_chunk_max + 1); return extent_0; } @@ -215,10 +389,10 @@ class DynamicView : public Kokkos::ViewTraits { // Allocation tracking properties KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } + int use_count() const { return m_chunks_host.track().use_count(); } inline const std::string label() const { - return m_track.template get_label(); + return m_chunks_host.track().template get_label(); } //---------------------------------------------------------------------- @@ -285,13 +459,7 @@ class DynamicView : public Kokkos::ViewTraits { * up to the maximum number of chunks * */ template - inline typename std::enable_if< - std::is_integral::value && - Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - typename Impl::ChunkArraySpace< - typename traits::memory_space>::memory_space>::accessible>::type - resize_serial(IntType const& n) { + inline void resize_serial(IntType const& n) { using local_value_type = typename traits::value_type; using value_pointer_type = local_value_type*; @@ -304,37 +472,40 @@ class DynamicView : public Kokkos::ViewTraits { } // *m_chunks[m_chunk_max] stores the current number of chunks being used - uintptr_t* const pc = reinterpret_cast(m_chunks + m_chunk_max); - std::string _label = - m_track.template get_label(); + uintptr_t* const pc = + reinterpret_cast(m_chunks_host + m_chunk_max); + std::string _label = m_chunks_host.track().template get_label(); + if (*pc < NC) { while (*pc < NC) { - m_chunks[*pc] = reinterpret_cast( - typename traits::memory_space().allocate( + m_chunks_host[*pc] = + reinterpret_cast(device_space().allocate( _label.c_str(), sizeof(local_value_type) << m_chunk_shift)); ++*pc; } } else { while (NC + 1 <= *pc) { --*pc; - typename traits::memory_space().deallocate( - _label.c_str(), m_chunks[*pc], - 
sizeof(local_value_type) << m_chunk_shift); - m_chunks[*pc] = nullptr; + device_space().deallocate(_label.c_str(), m_chunks_host[*pc], + sizeof(local_value_type) << m_chunk_shift); + m_chunks_host[*pc] = nullptr; } } - // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize + // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize *(pc + 1) = n; + + m_chunks_host.deep_copy_to(m_chunks); } KOKKOS_INLINE_FUNCTION bool is_allocated() const { - if (m_chunks == nullptr) { - return false; - } else { - // *m_chunks[m_chunk_max] stores the current number of chunks being used + if (m_chunks_host.valid()) { + // *m_chunks_host[m_chunk_max] stores the current number of chunks being + // used uintptr_t* const pc = - reinterpret_cast(m_chunks + m_chunk_max); + reinterpret_cast(m_chunks_host + m_chunk_max); return (*(pc + 1) > 0); + } else { + return false; } } @@ -349,8 +520,8 @@ class DynamicView : public Kokkos::ViewTraits { template DynamicView(const DynamicView& rhs) - : m_track(rhs.m_track), - m_chunks((typename traits::value_type**)rhs.m_chunks), + : m_chunks(rhs.m_chunks), + m_chunks_host(rhs.m_chunks_host), m_chunk_shift(rhs.m_chunk_shift), m_chunk_mask(rhs.m_chunk_mask), m_chunk_max(rhs.m_chunk_max), @@ -361,63 +532,6 @@ class DynamicView : public Kokkos::ViewTraits { "Incompatible DynamicView copy construction"); } - //---------------------------------------------------------------------- - - struct Destroy { - using local_value_type = typename traits::value_type; - std::string m_label; - local_value_type** m_chunks; - unsigned m_chunk_max; - bool m_destroy; - unsigned m_chunk_size; - - // Initialize or destroy array of chunk pointers. - // Two entries beyond the max chunks are allocation counters. 
- inline void operator()(unsigned i) const { - if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) { - typename traits::memory_space().deallocate( - m_label.c_str(), m_chunks[i], - sizeof(local_value_type) * m_chunk_size); - } - m_chunks[i] = nullptr; - } - - void execute(bool arg_destroy) { - using Range = Kokkos::RangePolicy; - - m_destroy = arg_destroy; - - Kokkos::Impl::ParallelFor closure( - *this, - Range(0, m_chunk_max + 2)); // Add 2 to 'destroy' extra slots storing - // num_chunks and extent; previously + 1 - - closure.execute(); - - typename traits::execution_space().fence(); - // Impl::ChunkArraySpace< typename traits::memory_space - // >::memory_space::execution_space().fence(); - } - - void construct_shared_allocation() { execute(false); } - - void destroy_shared_allocation() { execute(true); } - - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; - Destroy& operator=(const Destroy&) = default; - - Destroy(std::string label, typename traits::value_type** arg_chunk, - const unsigned arg_chunk_max, const unsigned arg_chunk_size) - : m_label(label), - m_chunks(arg_chunk), - m_chunk_max(arg_chunk_max), - m_destroy(false), - m_chunk_size(arg_chunk_size) {} - }; - /**\brief Allocation constructor * * Memory is allocated in chunks @@ -427,10 +541,7 @@ class DynamicView : public Kokkos::ViewTraits { explicit inline DynamicView(const std::string& arg_label, const unsigned min_chunk_size, const unsigned max_extent) - : m_track(), - m_chunks(nullptr) - // The chunk size is guaranteed to be a power of two - , + : // The chunk size is guaranteed to be a power of two m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains( min_chunk_size)) // div ceil(log2(min_chunk_size)) , @@ -440,28 +551,22 @@ class DynamicView : public Kokkos::ViewTraits { m_chunk_shift) // max num pointers-to-chunks in array , m_chunk_size(2 << (m_chunk_shift - 1)) { - using 
chunk_array_memory_space = typename Impl::ChunkArraySpace< - typename traits::memory_space>::memory_space; - // A functor to deallocate all of the chunks upon final destruction - using record_type = - Kokkos::Impl::SharedAllocationRecord; + m_chunks = device_accessor(m_chunk_max, m_chunk_size); - // Allocate chunk pointers and allocation counter - record_type* const record = - record_type::allocate(chunk_array_memory_space(), arg_label, - (sizeof(pointer_type) * (m_chunk_max + 2))); - // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == - // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in - // Destroy's execute(...) method - - m_chunks = reinterpret_cast(record->data()); - - record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size); - - // Initialize to zero - record->m_destroy.construct_shared_allocation(); - - m_track.assign_allocated_record_to_uninitialized(record); + if (device_accessor::template IsAccessibleFrom::value) { + m_chunks.template allocate_with_destroy(arg_label); + m_chunks.initialize(); + m_chunks_host = + device_accessor::template create_mirror(m_chunks); + } else { + m_chunks.allocate_device(arg_label); + m_chunks_host = + device_accessor::template create_mirror(m_chunks); + m_chunks_host.template allocate_with_destroy( + arg_label, m_chunks.get_ptr()); + m_chunks_host.initialize(); + m_chunks_host.deep_copy_to(m_chunks); + } } }; @@ -487,8 +592,8 @@ inline void deep_copy(const View& dst, enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if (DstExecCanAccessSrc) { @@ -512,8 +617,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if (DstExecCanAccessSrc) { diff --git a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp index 
fbfaed9b1b..18f026dc6f 100644 --- a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp +++ b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp @@ -187,7 +187,8 @@ template void ErrorReporter::resize(const size_t new_size) { m_reports.resize(new_size); m_reporters.resize(new_size); - typename DeviceType::execution_space().fence(); + typename DeviceType::execution_space().fence( + "Kokkos::Experimental::ErrorReporter::resize: fence after resizing"); } } // namespace Experimental diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 0f21a08ba3..57bf745d40 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( This check should cover the case of Views that don't have the Unmanaged trait but were initialized by pointer. */ if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device( - map, Kokkos::Impl::has_printable_label_typedef()); + Kokkos::Impl::operator_bounds_error_on_device(map); } else { Kokkos::abort("OffsetView bounds error"); } @@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits { // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::OffsetView::OffsetView(): fence before UVM allocation"); } #endif //------------------------------------------------------------ @@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits { #if defined(KOKKOS_ENABLE_CUDA) if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::OffsetView::OffsetView(): fence after UVM allocation"); } #endif //------------------------------------------------------------ diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index dcd4cf73e5..79bc43b739 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -834,7 +834,7 @@ class ScatterView::value, "ScatterView contribute destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView contribute destination memory space not accessible"); if (dest.data() == internal_view.data()) return; @@ -1061,7 +1061,7 @@ class ScatterView::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); bool is_equal = (dest.data() == internal_view.data()); @@ -1290,7 +1290,7 @@ class ScatterView::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination 
memory space not accessible"); auto extent = internal_view.extent(internal_view_type::rank - 1); diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 81be3ee2d3..cd633e4031 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -405,7 +405,9 @@ class StaticCrsGraph { Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning", Kokkos::RangePolicy(0, numRows()), partitioner); - typename device_type::execution_space().fence(); + typename device_type::execution_space().fence( + "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after " + "partition"); row_block_offsets = block_offsets; } diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index edb0e7261d..a1601eee35 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -345,7 +345,8 @@ class UnorderedMap { const impl_value_type tmp = impl_value_type(); Kokkos::deep_copy(m_values, tmp); } - { Kokkos::deep_copy(m_scalars, 0); } + Kokkos::deep_copy(m_scalars, 0); + m_size = 0; } KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { @@ -393,9 +394,9 @@ class UnorderedMap { /// /// This method has undefined behavior when erasable() is true. /// - /// Note that this is not a device function; it cannot be called in + /// Note that this is not a device function; it cannot be called in /// a parallel kernel. The value is not stored as a variable; it - /// must be computed. + /// must be computed. m_size is a mutable cache of that value. 
size_type size() const { if (capacity() == 0u) return 0u; if (modified()) { @@ -419,9 +420,13 @@ class UnorderedMap { bool begin_erase() { bool result = !erasable(); if (is_insertable_map && result) { - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::begin_erase: fence before setting erasable " + "flag"); set_flag(erasable_idx); - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::begin_erase: fence after setting erasable " + "flag"); } return result; } @@ -429,10 +434,12 @@ class UnorderedMap { bool end_erase() { bool result = erasable(); if (is_insertable_map && result) { - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::end_erase: fence before erasing"); Impl::UnorderedMapErase f(*this); f.apply(); - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::end_erase: fence after erasing"); reset_flag(erasable_idx); } return result; diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index a1fbba6b21..88721bd89e 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -119,12 +119,14 @@ class vector : public DualView { if (DV::template need_sync()) { set_functor_host f(DV::h_view, val); parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_host::execution_space().fence(); + typename DV::t_host::execution_space().fence( + "Kokkos::vector::assign: fence after assigning values"); DV::template modify(); } else { set_functor f(DV::d_view, val); parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_dev::execution_space().fence(); + typename DV::t_dev::execution_space().fence( + "Kokkos::vector::assign: fence after assigning values"); DV::template modify(); } } diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 6047e60f3d..9512f2d4a2 100644 --- 
a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -57,22 +57,10 @@ namespace Kokkos { namespace Impl { -KOKKOS_FORCEINLINE_FUNCTION -unsigned rotate_left(unsigned i, int r) { - constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); - return r ? ((i << r) | (i >> (size - r))) : i; -} - KOKKOS_FORCEINLINE_FUNCTION unsigned rotate_right(unsigned i, int r) { constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); - // FIXME_SYCL llvm.fshr.i32 missing - // (https://github.com/intel/llvm/issues/3308) -#ifdef __SYCL_DEVICE_ONLY__ - return rotate_left(i, size - r); -#else return r ? ((i >> r) | (i << (size - r))) : i; -#endif } template diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp index 367ab33857..fdd78e4e5f 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp @@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) { KOKKOS_INLINE_FUNCTION uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) { - const uint8_t* data = (const uint8_t*)key; + const uint8_t* data = static_cast(key); const int nblocks = len / 4; uint32_t h1 = seed; diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 3eee85ed10..e22564aa5c 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include namespace Test { diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index dd0199ed81..a8d62bd24c 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -702,6 +702,11 @@ class TestDynViewAPI { using View0 = Kokkos::View; using 
View1 = Kokkos::View; + using View2 = Kokkos::View; + using View3 = Kokkos::View; + using View4 = Kokkos::View; + using View5 = Kokkos::View; + using View6 = Kokkos::View; using View7 = Kokkos::View; using host_view_space = typename View0::host_mirror_space; @@ -1065,7 +1070,7 @@ class TestDynViewAPI { dView0 d_uninitialized( Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20); - ASSERT_TRUE(d_uninitialized.data() != nullptr); + ASSERT_NE(d_uninitialized.data(), nullptr); ASSERT_EQ(d_uninitialized.rank(), 2); ASSERT_EQ(d_uninitialized.extent(0), 10); ASSERT_EQ(d_uninitialized.extent(1), 20); @@ -1075,14 +1080,14 @@ class TestDynViewAPI { hView0 hx, hy, hz; ASSERT_TRUE(Kokkos::is_dyn_rank_view::value); - ASSERT_FALSE(Kokkos::is_dyn_rank_view >::value); + ASSERT_FALSE(Kokkos::is_dyn_rank_view>::value); - ASSERT_TRUE(dx.data() == nullptr); // Okay with UVM - ASSERT_TRUE(dy.data() == nullptr); // Okay with UVM - ASSERT_TRUE(dz.data() == nullptr); // Okay with UVM - ASSERT_TRUE(hx.data() == nullptr); - ASSERT_TRUE(hy.data() == nullptr); - ASSERT_TRUE(hz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); // Okay with UVM + ASSERT_EQ(dy.data(), nullptr); // Okay with UVM + ASSERT_EQ(dz.data(), nullptr); // Okay with UVM + ASSERT_EQ(hx.data(), nullptr); + ASSERT_EQ(hy.data(), nullptr); + ASSERT_EQ(hz.data(), nullptr); ASSERT_EQ(dx.extent(0), 0u); // Okay with UVM ASSERT_EQ(dy.extent(0), 0u); // Okay with UVM ASSERT_EQ(dz.extent(0), 0u); // Okay with UVM @@ -1153,11 +1158,11 @@ class TestDynViewAPI { ASSERT_EQ(dx.use_count(), size_t(2)); - ASSERT_FALSE(dx.data() == nullptr); - ASSERT_FALSE(const_dx.data() == nullptr); - ASSERT_FALSE(unmanaged_dx.data() == nullptr); - ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr); - ASSERT_FALSE(dy.data() == nullptr); + ASSERT_NE(dx.data(), nullptr); + ASSERT_NE(const_dx.data(), nullptr); + ASSERT_NE(unmanaged_dx.data(), nullptr); + ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr); + ASSERT_NE(dy.data(), nullptr); 
ASSERT_NE(dx, dy); ASSERT_EQ(dx.extent(0), unsigned(N0)); @@ -1317,17 +1322,17 @@ class TestDynViewAPI { ASSERT_NE(dx, dz); dx = dView0(); - ASSERT_TRUE(dx.data() == nullptr); - ASSERT_FALSE(dy.data() == nullptr); - ASSERT_FALSE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_NE(dy.data(), nullptr); + ASSERT_NE(dz.data(), nullptr); dy = dView0(); - ASSERT_TRUE(dx.data() == nullptr); - ASSERT_TRUE(dy.data() == nullptr); - ASSERT_FALSE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_EQ(dy.data(), nullptr); + ASSERT_NE(dz.data(), nullptr); dz = dView0(); - ASSERT_TRUE(dx.data() == nullptr); - ASSERT_TRUE(dy.data() == nullptr); - ASSERT_TRUE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_EQ(dy.data(), nullptr); + ASSERT_EQ(dz.data(), nullptr); // View - DynRankView Interoperability tests // deep_copy from view to dynrankview @@ -1367,7 +1372,7 @@ class TestDynViewAPI { static void check_auto_conversion_to_const( const Kokkos::DynRankView& arg_const, const Kokkos::DynRankView& arg) { - ASSERT_TRUE(arg_const == arg); + ASSERT_EQ(arg_const, arg); } static void run_test_allocated() { @@ -1396,8 +1401,8 @@ class TestDynViewAPI { const_typeX xc = x; const_typeR xr = x; - ASSERT_TRUE(xc == x); - ASSERT_TRUE(x == xc); + ASSERT_EQ(xc, x); + ASSERT_EQ(x, xc); // For CUDA the constant random access View does not return // an lvalue reference due to retrieving through texture cache @@ -1406,7 +1411,7 @@ class TestDynViewAPI { if (!std::is_same::value) #endif { - ASSERT_TRUE(x.data() == xr.data()); + ASSERT_EQ(x.data(), xr.data()); } // typeX xf = xc ; // setting non-const from const must not compile @@ -1659,29 +1664,29 @@ class TestDynViewAPI { const_svector_right_type cvr3 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); - ASSERT_TRUE(&v1[0] == &v1(0)); - ASSERT_TRUE(&v1[0] == &mv(0, 0)); - ASSERT_TRUE(&v2[0] == &mv(0, 1)); - ASSERT_TRUE(&v3[0] == &mv(0, 2)); + ASSERT_EQ(&v1[0], &v1(0)); + ASSERT_EQ(&v1[0], &mv(0, 0)); + 
ASSERT_EQ(&v2[0], &mv(0, 1)); + ASSERT_EQ(&v3[0], &mv(0, 2)); - ASSERT_TRUE(&cv1[0] == &mv(0, 0)); - ASSERT_TRUE(&cv2[0] == &mv(0, 1)); - ASSERT_TRUE(&cv3[0] == &mv(0, 2)); + ASSERT_EQ(&cv1[0], &mv(0, 0)); + ASSERT_EQ(&cv2[0], &mv(0, 1)); + ASSERT_EQ(&cv3[0], &mv(0, 2)); - ASSERT_TRUE(&vr1[0] == &mv(0, 0)); - ASSERT_TRUE(&vr2[0] == &mv(0, 1)); - ASSERT_TRUE(&vr3[0] == &mv(0, 2)); + ASSERT_EQ(&vr1[0], &mv(0, 0)); + ASSERT_EQ(&vr2[0], &mv(0, 1)); + ASSERT_EQ(&vr3[0], &mv(0, 2)); - ASSERT_TRUE(&cvr1[0] == &mv(0, 0)); - ASSERT_TRUE(&cvr2[0] == &mv(0, 1)); - ASSERT_TRUE(&cvr3[0] == &mv(0, 2)); + ASSERT_EQ(&cvr1[0], &mv(0, 0)); + ASSERT_EQ(&cvr2[0], &mv(0, 1)); + ASSERT_EQ(&cvr3[0], &mv(0, 2)); - ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2)); - ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3)); - ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4)); - ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2)); - ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3)); - ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4)); + ASSERT_EQ(&mv1(0, 0), &mv(1, 2)); + ASSERT_EQ(&mv1(1, 1), &mv(2, 3)); + ASSERT_EQ(&mv1(3, 2), &mv(4, 4)); + ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2)); + ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3)); + ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4)); const_svector_type c_cv1(v1); typename svector_type::const_type c_cv2(v2); diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index f018793dd6..023bf92f62 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -52,7 +52,7 @@ #include #include -#include +#include namespace Test { diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index 9ddc226e29..24a43e1ebc 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include diff --git 
a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index fdbce2d492..342ce2af48 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls sizes(LENGTH); - size_t total_length = 0; - for (size_t i = 0; i < LENGTH; ++i) { sizes[i] = rand() % 1000; } @@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) { sizes[1] = N; sizes[1998] = N; - for (size_t i = 0; i < LENGTH; ++i) { - total_length += sizes[i]; - } - int C = 0; dView dx = Kokkos::create_staticcrsgraph("test", sizes); dx.create_block_partitioning(B, C); diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4413cfbc80..8009b99656 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) { } // FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs -// FIXME_HIP // WORKAROUND MSVC -#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ - !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) +#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) TEST(TEST_CATEGORY, UnorderedMap_insert) { for (int i = 0; i < 500; ++i) { test_insert(100000, 90000, 100, true); @@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { ASSERT_TRUE(n.is_allocated()); } +TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) { + using Map = + Kokkos::UnorderedMap; + + Map m(11); + ASSERT_EQ(0u, m.size()); + + m.insert(2); + m.insert(3); + m.insert(5); + m.insert(7); + ASSERT_EQ(4u, m.size()); + + m.clear(); + ASSERT_EQ(0u, m.size()); +} + } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in deleted file mode 
100644 index f0835772b8..0000000000 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ /dev/null @@ -1,104 +0,0 @@ -/* The trivial 'src/build_common.sh' creates a config - * that must stay in sync with this file. - */ -#cmakedefine KOKKOS_FOR_SIERRA - -#if !defined(KOKKOS_FOR_SIERRA) - -#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) -#error \ - "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." -#else -#define KOKKOS_CORE_CONFIG_H -#endif - -#cmakedefine KOKKOS_ENABLE_CUDA -#cmakedefine KOKKOS_ENABLE_HIP -#cmakedefine KOKKOS_ENABLE_OPENMP -#cmakedefine KOKKOS_ENABLE_THREADS -#cmakedefine KOKKOS_ENABLE_SERIAL -#cmakedefine KOKKOS_ENABLE_Winthread - -#cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_ENABLE_HBWSPACE -#cmakedefine KOKKOS_ENABLE_LIBRT - -#cmakedefine KOKKOS_ENABLE_DEBUG -#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK -#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK -#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT -#cmakedefine KOKKOS_ENABLE_TUNING - -#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION - -#ifdef KOKKOS_ENABLE_CUDA - -#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC - -// mfh 16 Sep 2014: If passed in on the command line, that overrides -// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build -// warnings like this one: -// -// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: -// "KOKKOS_USE_CUDA_UVM" redefined -// -// At some point, we should edit the test-build scripts in -// Trilinos/cmake/ctest/drivers/perseus/, and take -// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I -// hesitate to do that now, because I'm not sure if all the files are -// including KokkosCore_config.h (or a header file that includes it) like -// they should. 
-#ifndef KOKKOS_USE_CUDA_UVM -#cmakedefine KOKKOS_USE_CUDA_UVM -#endif - -#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - -#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA - -#endif - -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND - -#ifndef __CUDA_ARCH__ -#cmakedefine KOKKOS_ENABLE_ISA_X86_64 -#cmakedefine KOKKOS_ENABLE_ISA_KNC -#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE -#endif - -#ifdef KOKKOS_ENABLE_HIP -#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -#endif - -#cmakedefine KOKKOS_ARCH_ARMV80 1 -#cmakedefine KOKKOS_ARCH_ARMV81 1 -#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1 -#cmakedefine KOKKOS_ARCH_AVX 1 -#cmakedefine KOKKOS_ARCH_AVX2 1 -#cmakedefine KOKKOS_ARCH_AVX512MIC 1 -#cmakedefine KOKKOS_ARCH_AVX512XEON 1 -#cmakedefine KOKKOS_ARCH_KNC 1 -#cmakedefine KOKKOS_ARCH_POWER8 1 -#cmakedefine KOKKOS_ARCH_POWER9 1 -#cmakedefine KOKKOS_ARCH_KEPLER 1 -#cmakedefine KOKKOS_ARCH_KEPLER30 1 -#cmakedefine KOKKOS_ARCH_KEPLER32 1 -#cmakedefine KOKKOS_ARCH_KEPLER35 1 -#cmakedefine KOKKOS_ARCH_KEPLER37 1 -#cmakedefine KOKKOS_ARCH_MAXWELL 1 -#cmakedefine KOKKOS_ARCH_MAXWELL50 1 -#cmakedefine KOKKOS_ARCH_MAXWELL52 1 -#cmakedefine KOKKOS_ARCH_MAXWELL53 1 -#cmakedefine KOKKOS_ARCH_PASCAL 1 -#cmakedefine KOKKOS_ARCH_PASCAL60 1 -#cmakedefine KOKKOS_ARCH_PASCAL61 1 -#cmakedefine KOKKOS_ARCH_VOLTA70 1 - -// TODO: These are currently not used in Kokkos. Should they be removed? -#cmakedefine KOKKOS_ENABLE_MPI -#cmakedefine KOKKOS_ENABLE_CUSPARSE - -// TODO: No longer options in Kokkos. Need to be removed. 
-#cmakedefine KOKKOS_USING_DEPRECATED_VIEW - -#endif // !defined(KOKKOS_FOR_SIERRA) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index 9ff4b6006d..a7c57a9434 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -10,9 +10,7 @@ #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. -IF (KOKKOS_ENABLE_OPENMPTARGET - AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI - OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) +IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) RETURN() ENDIF() diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp index dee21fd7a5..b534c32c52 100644 --- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp +++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp @@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials, std::cout << label_gramschmidt << " , " << parallel_work_length << " , " << min_seconds << " , " << (min_seconds / parallel_work_length) - << std::endl; + << ", " << avg_seconds << std::endl; } } diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp index c431c2b0c8..24c1898e0a 100644 --- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials, std::cout << label_hexgrad << " , " << parallel_work_length << " , " << min_seconds << " , " << (min_seconds / parallel_work_length) - << std::endl; + << avg_seconds << std::endl; } } diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 50bbc78a6b..5b7c2a7a03 100644 --- 
a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) { double time_end = timer.seconds(); if (SpaceInstance::overlap()) { - ASSERT_TRUE((time_end > 1.5 * time_overlap)); + ASSERT_GT(time_end, 1.5 * time_overlap); } printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, time_overlap); @@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) { double time_not_fenced = timer.seconds(); Kokkos::fence(); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + ASSERT_GT(time_fenced, 2.0 * time_not_fenced); } timer.reset(); @@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) { ASSERT_EQ(h_result2(), h_result()); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); } printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", time_no_overlapped_reduce, time_overlapped_reduce); @@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) { double time_end = timer.seconds(); if (SpaceInstance::overlap()) { - ASSERT_TRUE((time_end > 1.5 * time_overlap)); + ASSERT_GT(time_end, 1.5 * time_overlap); } printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, time_overlap); @@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) { double time_not_fenced = timer.seconds(); Kokkos::fence(); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + ASSERT_GT(time_fenced, 2.0 * time_not_fenced); } timer.reset(); @@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) { ASSERT_EQ(h_result2(), h_result()); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); } printf("Time 
MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", time_no_overlapped_reduce, time_overlapped_reduce); @@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) { double time_end = timer.seconds(); if (SpaceInstance::overlap()) { - ASSERT_TRUE((time_end > 1.5 * time_overlap)); + ASSERT_GT(time_end, 1.5 * time_overlap); } printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, time_overlap); @@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) { double time_not_fenced = timer.seconds(); Kokkos::fence(); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + ASSERT_GT(time_fenced, 2.0 * time_not_fenced); } timer.reset(); Kokkos::parallel_reduce( @@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) { ASSERT_EQ(h_result2(), h_result()); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); } printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", time_no_overlapped_reduce, time_overlapped_reduce); diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp index 550316bec9..555a05ea27 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp @@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) { { Kokkos::Timer timer; for (int r = 0; r < R; r++) { - double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8); + double* a_ptr = + static_cast(Kokkos::kokkos_malloc("A", sizeof(double) * N8)); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); Kokkos::fence(); diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp index afeeb64356..b0562f2fd1 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp +++ 
b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp @@ -47,10 +47,18 @@ namespace Test { TEST(default_exec, ViewResize_Rank8) { +// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI +#ifdef KOKKOS_ENABLE_SYCL + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests8(9, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests8(9, 1); +#else printf("Resize View Performance for LayoutLeft:\n"); run_resizeview_tests8(10, 1); printf("Resize View Performance for LayoutRight:\n"); run_resizeview_tests8(10, 1); +#endif } } // namespace Test diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp index 59820f3bdd..54824e5b39 100644 --- a/lib/kokkos/core/perf_test/test_atomic.cpp +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -47,7 +47,7 @@ #include #include -#include +#include using exec_space = Kokkos::DefaultExecutionSpace; @@ -401,7 +401,7 @@ template void Loop(int loop, int test, const char* type_name) { LoopVariant(loop, test); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; T res = LoopVariant(loop, test); double time = timer.seconds(); diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp index eec1c8eacc..4086ef5816 100644 --- a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp +++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -12,13 +12,13 @@ #include #include -#include +#include using exec_space = Kokkos::DefaultExecutionSpace; template void test(const int length) { - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; using vector = Kokkos::View; diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp index 9aab119774..7887d4ba55 100644 --- a/lib/kokkos/core/perf_test/test_mempool.cpp +++ b/lib/kokkos/core/perf_test/test_mempool.cpp @@ -48,7 +48,7 @@ #include #include -#include +#include using ExecSpace = 
Kokkos::DefaultExecutionSpace; using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space; @@ -100,7 +100,7 @@ struct TestFunctor { const unsigned size_alloc = chunk * (1 + (j % chunk_span)); - ptrs(j) = (uintptr_t)pool.allocate(size_alloc); + ptrs(j) = reinterpret_cast(pool.allocate(size_alloc)); if (ptrs(j)) ++update; } @@ -129,7 +129,7 @@ struct TestFunctor { const unsigned size_alloc = chunk * (1 + (j % chunk_span)); - pool.deallocate((void*)ptrs(j), size_alloc); + pool.deallocate(reinterpret_cast(ptrs(j)), size_alloc); } } @@ -153,9 +153,9 @@ struct TestFunctor { for (unsigned k = 0; k < repeat_inner; ++k) { const unsigned size_alloc = chunk * (1 + (j % chunk_span)); - pool.deallocate((void*)ptrs(j), size_alloc); + pool.deallocate(reinterpret_cast(ptrs(j)), size_alloc); - ptrs(j) = (uintptr_t)pool.allocate(size_alloc); + ptrs(j) = reinterpret_cast(pool.allocate(size_alloc)); if (0 == ptrs(j)) update++; } @@ -266,7 +266,7 @@ int main(int argc, char* argv[]) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, fill_stride, chunk_span, repeat_inner); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; if (!functor.test_fill()) { Kokkos::abort("fill "); diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp index b2f936a955..49957ae932 100644 --- a/lib/kokkos/core/perf_test/test_taskdag.cpp +++ b/lib/kokkos/core/perf_test/test_taskdag.cpp @@ -56,7 +56,7 @@ int main() { return 0; } #include #include -#include +#include using ExecSpace = Kokkos::DefaultExecutionSpace; @@ -220,7 +220,7 @@ int main(int argc, char* argv[]) { double time_sum = 0; for (int i = 0; i < test_repeat_outer; ++i) { - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; Functor::FutureType ftmp = Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input)); diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index 2ab0989805..499736c60d 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ 
b/lib/kokkos/core/src/CMakeLists.txt @@ -9,6 +9,8 @@ INSTALL (DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING + PATTERN "*.inc" + PATTERN "*.inc_*" PATTERN "*.hpp" PATTERN "*.h" ) @@ -65,6 +67,15 @@ IF (KOKKOS_ENABLE_SYCL) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) ENDIF() +IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc) +ENDIF() + + KOKKOS_ADD_LIBRARY( kokkoscore SOURCES ${KOKKOS_CORE_SRCS} @@ -86,3 +97,15 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) + +# FIXME: We need a proper solution to figure out whether to enable +# libatomic +# XL requires libatomic even for 64 bit CAS, most others only for 128 +# I (CT) had removed 128bit CAS from desul to not need libatomic. 
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND + (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang))) + target_link_libraries(kokkoscore PUBLIC atomic) +ENDIF() + + +KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 916f109758..f6b2762403 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -90,43 +90,25 @@ static std::atomic num_uvm_allocations(0); } // namespace -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); +void DeepCopyCuda(void *dst, const void *src, size_t n) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); } -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); -} - -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( - cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( - cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( +void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src, + size_t n) { + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); } void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { cudaStream_t s = cuda_get_deep_copy_stream(); - CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); - cudaStreamSynchronize(s); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpyAsync(dst, src, 
n, cudaMemcpyDefault, s)); + Impl::cuda_stream_synchronize( + s, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + DeepCopyResourceSynchronization, + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); } } // namespace Impl @@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { namespace Kokkos { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED void CudaSpace::access_error() { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " @@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { "non-Cuda space"); Kokkos::Impl::throw_runtime_exception(msg); } +#endif /*--------------------------------------------------------------------------*/ @@ -164,9 +148,11 @@ bool CudaUVMSpace::available() { /*--------------------------------------------------------------------------*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 int CudaUVMSpace::number_of_allocations() { return Kokkos::Impl::num_uvm_allocations.load(); } +#endif #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST // The purpose of the following variable is to allow a state-based choice // for pinning UVM allocations to the CPU. For now this is considered @@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} CudaHostPinnedSpace::CudaHostPinnedSpace() {} +int memory_threshold_g = 40000; // 40 kB + //============================================================================== // {{{1 @@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; +#ifndef CUDART_VERSION +#error CUDART_VERSION undefined! 
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) + cudaError_t error_code; + if (arg_alloc_size >= memory_threshold_g) { + error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } else { + error_code = cudaMalloc(&ptr, arg_alloc_size); + } +#else auto error_code = cudaMalloc(&ptr, arg_alloc_size); +#endif if (error_code != cudaSuccess) { // TODO tag as unlikely branch cudaGetLastError(); // This is the only way to clear the last error, which // we should do here since we're turning it into an @@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation"); if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; @@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate( CudaMallocManaged); } } - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; @@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); +#ifndef CUDART_VERSION +#error CUDART_VERSION undefined! +#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) + if (arg_alloc_size >= memory_threshold_g) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } else { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + } +#else + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); +#endif } catch (...) 
{ } } @@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate( , const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation"); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; @@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation"); } void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr, @@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) { } } @@ -462,7 +477,7 @@ SharedAllocationRecord::attach_texture_object( resDesc.res.linear.sizeInBytes = alloc_size; resDesc.res.linear.devPtr = alloc_ptr; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr)); return tex_obj; @@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; cudaPointerAttributes attr; - CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr)); // I measured this and it turns out prefetching towards the host slows // DualView syncs down. Probably because the latency is not too bad in the // first place for the pull down. 
If we want to change that provde @@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #endif if (to_device && is_managed && space.cuda_device_prop().concurrentManagedAccess) { - CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(), - space.cuda_stream())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync( + ptr, bytes, space.cuda_device(), space.cuda_stream())); } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 0f4259072d..993c8d1bba 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination, } if (blocks_per_sm >= min_blocks_per_sm) { - if (threads_per_sm >= opt_threads_per_sm) { + // The logic prefers smaller block sizes over larger ones to + // give more flexibility to the scheduler. + // But don't go below 128 where performance suffers significantly + // for simple copy/set kernels. 
+ if ((threads_per_sm > opt_threads_per_sm) || + ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) { opt_block_size = block_size; opt_threads_per_sm = threads_per_sm; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index 4759001d81..36df0d2564 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -49,13 +49,19 @@ #ifdef KOKKOS_ENABLE_CUDA #include - +#include #include namespace Kokkos { namespace Impl { -void cuda_device_synchronize(); +void cuda_stream_synchronize( + const cudaStream_t stream, + Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, + const std::string& name); +void cuda_device_synchronize(const std::string& name); +void cuda_stream_synchronize(const cudaStream_t stream, + const std::string& name); void cuda_internal_error_throw(cudaError e, const char* name, const char* file = nullptr, const int line = 0); @@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name, } } -#define CUDA_SAFE_CALL(call) \ +#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \ Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__) +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + +KOKKOS_DEPRECATED +inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name, + const char* file = nullptr, + const int line = 0) { + cuda_internal_safe_call(e, name, file, line); +} + +#define CUDA_SAFE_CALL(call) \ + Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \ + __LINE__) + +#endif + } // namespace Impl namespace Experimental { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 3de7a69916..bd514f5e88 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -60,6 +60,7 @@ #include #include +#include namespace Kokkos { namespace Impl 
{ @@ -82,8 +83,8 @@ struct GraphImpl { constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; - CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node, - error_log, error_log_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate( + &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); // TODO @graphs print out errors } @@ -107,26 +108,27 @@ struct GraphImpl { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - m_execution_space.fence(); + m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(bool(m_graph)) if (bool(m_graph_exec)) { - CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); } - CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); }; explicit GraphImpl(Kokkos::Cuda arg_instance) : m_execution_space(std::move(arg_instance)) { - CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); } void add_node(std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), - m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } template @@ -171,7 +173,7 @@ struct GraphImpl { auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node; KOKKOS_EXPECTS(bool(cuda_node)) - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1)); } @@ -179,7 +181,7 
@@ struct GraphImpl { if (!bool(m_graph_exec)) { _instantiate_graph(); } - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream())); } @@ -192,9 +194,10 @@ struct GraphImpl { KOKKOS_EXPECTS(!bool(m_graph_exec)) auto rv = std::make_shared( get_execution_space(), _graph_node_is_root_ctor_tag{}); - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); KOKKOS_ENSURES(bool(rv->node_details_t::node)) return rv; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp index ec9c434fe6..c81286eb10 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -51,6 +51,9 @@ !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \ defined(KOKKOS_ARCH_MAXWELL52)) #include +#include // istream & ostream for extraction and insertion ops +#include +#include // reduction_identity #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED // Make sure no one else tries to define half_t @@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> cast_from_half(half_t); -class half_t { +class alignas(2) half_t { public: using impl_type = Kokkos::Impl::half_impl_t::type; @@ -138,6 +141,22 @@ class half_t { KOKKOS_FUNCTION half_t() : val(0.0F) {} + // Copy constructors + KOKKOS_DEFAULTED_FUNCTION + half_t(const half_t&) noexcept = default; + + KOKKOS_INLINE_FUNCTION + half_t(const volatile half_t& rhs) { +#ifdef __CUDA_ARCH__ + val = rhs.val; +#else + const volatile uint16_t* rv_ptr = + reinterpret_cast(&rhs.val); + const uint16_t rv_val = *rv_ptr; + val = reinterpret_cast(rv_val); +#endif // __CUDA_ARCH__ + } + // Don't support implicit conversion back to impl_type. 
// impl_type is a storage only type on host. KOKKOS_FUNCTION @@ -219,7 +238,7 @@ class half_t { #ifdef __CUDA_ARCH__ tmp.val = +tmp.val; #else - tmp.val = __float2half(+__half2float(tmp.val)); + tmp.val = __float2half(+__half2float(tmp.val)); #endif return tmp; } @@ -230,7 +249,7 @@ class half_t { #ifdef __CUDA_ARCH__ tmp.val = -tmp.val; #else - tmp.val = __float2half(-__half2float(tmp.val)); + tmp.val = __float2half(-__half2float(tmp.val)); #endif return tmp; } @@ -241,7 +260,7 @@ class half_t { #ifdef __CUDA_ARCH__ ++val; #else - float tmp = __half2float(val); + float tmp = __half2float(val); ++tmp; val = __float2half(tmp); #endif @@ -255,7 +274,7 @@ class half_t { #else float tmp = __half2float(val); --tmp; - val = __float2half(tmp); + val = __float2half(tmp); #endif return *this; } @@ -290,7 +309,10 @@ class half_t { template KOKKOS_FUNCTION void operator=(T rhs) volatile { - val = cast_to_half(rhs).val; + impl_type new_val = cast_to_half(rhs).val; + volatile uint16_t* val_ptr = + reinterpret_cast(const_cast(&val)); + *val_ptr = reinterpret_cast(new_val); } // Compound operators @@ -299,30 +321,21 @@ class half_t { #ifdef __CUDA_ARCH__ val += rhs.val; #else - val = __float2half(__half2float(val) + __half2float(rhs.val)); + val = __float2half(__half2float(val) + __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator+=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. 
- val = const_cast(val) + rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ‘volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) + - __half2float(rhs.val)); -#endif - return *this; + void operator+=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs += tmp_rhs; + *this = tmp_lhs; } - // Compund operators: upcast overloads for += + // Compound operators: upcast overloads for += template KOKKOS_FUNCTION std::enable_if_t< std::is_same::value || std::is_same::value, T> friend @@ -350,27 +363,18 @@ class half_t { #ifdef __CUDA_ARCH__ val -= rhs.val; #else - val = __float2half(__half2float(val) - __half2float(rhs.val)); + val = __float2half(__half2float(val) - __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator-=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. 
- val = const_cast(val) - rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ‘volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) - - __half2float(rhs.val)); -#endif - return *this; + void operator-=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs -= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for -= @@ -401,27 +405,18 @@ class half_t { #ifdef __CUDA_ARCH__ val *= rhs.val; #else - val = __float2half(__half2float(val) * __half2float(rhs.val)); + val = __float2half(__half2float(val) * __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator*=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. - val = const_cast(val) * rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ‘volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) * - __half2float(rhs.val)); -#endif - return *this; + void operator*=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs *= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for *= @@ -452,27 +447,18 @@ class half_t { #ifdef __CUDA_ARCH__ val /= rhs.val; #else - val = __float2half(__half2float(val) / __half2float(rhs.val)); + val = __float2half(__half2float(val) / __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator/=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. 
Cast away volatile-ness of val for arithmetic but not for store - // location. - val = const_cast(val) / rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ‘volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) / - __half2float(rhs.val)); -#endif - return *this; + void operator/=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs /= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for /= @@ -504,7 +490,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val += rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); #endif return lhs; } @@ -529,7 +515,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val -= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); #endif return lhs; } @@ -554,7 +540,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val *= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); #endif return lhs; } @@ -579,7 +565,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val /= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); #endif return lhs; } @@ -683,6 +669,62 @@ class half_t { return __half2float(val) >= __half2float(rhs.val); #endif } + + KOKKOS_FUNCTION + friend bool operator==(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs == tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator!=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = 
lhs, tmp_rhs = rhs; + return tmp_lhs != tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs < tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs > tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs <= tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs >= tmp_rhs; + } + + // Insertion and extraction operators + friend std::ostream& operator<<(std::ostream& os, const half_t& x) { + const std::string out = std::to_string(static_cast(x)); + os << out; + return os; + } + + friend std::istream& operator>>(std::istream& is, half_t& x) { + std::string in; + is >> in; + x = std::stod(in); + return is; + } }; // CUDA before 11.1 only has the half <-> float conversions marked host device @@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION } #endif } // namespace Experimental + +// use float as the return type for sum and prod since cuda_fp16.h +// has no constexpr functions for casting to __half +template <> +struct reduction_identity { + KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept { + return 0.0F; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept { + return 1.0F; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept { + return -65504.0F; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept { + return 65504.0F; + } +}; + } // namespace Kokkos #endif // KOKKOS_IMPL_HALF_TYPE_DEFINED #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp 
index 016cb6cdcb..6964d5b41b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -119,7 +119,7 @@ int cuda_kernel_arch() { int arch = 0; int *d_arch = nullptr; - cudaMalloc((void **)&d_arch, sizeof(int)); + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int)); cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault); query_cuda_kernel_arch<<<1, 1>>>(d_arch); @@ -141,7 +141,36 @@ bool cuda_launch_blocking() { } // namespace -void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); } +void cuda_device_synchronize(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + []() { // TODO: correct device ID + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + }); +} + +void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, + const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ + ptr->impl_get_instance_id()}, + [&]() { // TODO: correct device ID + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + }); +} + +void cuda_stream_synchronize( + const cudaStream_t stream, + Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, + const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, reason, [&]() { // TODO: correct device ID + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + }); +} void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { @@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() { // See 'cudaSetDeviceFlags' for host-device thread interaction // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount)); if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { Kokkos::abort( @@ -229,7 +258,7 @@ CudaInternalDevices::CudaInternalDevices() { "have. Please report this to github.com/kokkos/kokkos."); } for (int i = 0; i < m_cudaDevCount; ++i) { - CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i)); } } @@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() { << std::endl; } - m_cudaDev = -1; - m_cudaArch = -1; - m_multiProcCount = 0; - m_maxWarpCount = 0; - m_maxBlock = 0; - m_maxSharedWords = 0; - m_maxConcurrency = 0; - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchUnifiedCount = 0; - m_scratchUnifiedSupported = 0; - m_streamCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - m_scratchConcurrentBitset = nullptr; - m_stream = nullptr; - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + m_cudaDev = -1; + m_cudaArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxConcurrency = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_scratchUnifiedSupported = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } } int CudaInternal::verify_is_initialized(const char *const label) const { @@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const { } return 0 <= m_cudaDev; } - +uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; } CudaInternal &CudaInternal::singleton() { static CudaInternal self; return self; } +void CudaInternal::fence(const std::string &name) const { + 
Impl::cuda_stream_synchronize(m_stream, this, name); +} void CudaInternal::fence() const { - CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream)); + fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { +void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream, + bool manage_stream) { if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; @@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { m_cudaDev = cuda_device_id; m_deviceProp = cudaProp; - CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); - Kokkos::Impl::cuda_device_synchronize(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + Kokkos::Impl::cuda_device_synchronize( + "Kokkos::CudaInternal::initialize: Fence on space initialization"); // Query what compute capability architecture a kernel executes: m_cudaArch = cuda_kernel_arch(); @@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { m_scratchConcurrentBitset = reinterpret_cast(r->data()); - CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0, - sizeof(uint32_t) * buffer_bound)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); } //---------------------------------- @@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default // Allocate a staging buffer for constant mem in pinned host memory // and an event to avoid overwriting driver for previous kernel launches if (stream == nullptr) { - CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging, - CudaTraits::ConstantMemoryUsage)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMallocHost(reinterpret_cast(&constantMemHostStaging), + CudaTraits::ConstantMemoryUsage)); - CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable)); } - m_stream = stream; - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + m_stream = stream; + m_manage_stream = manage_stream; + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } } //---------------------------------------------------------------------------- @@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const { m_scratchFlags = reinterpret_cast(r->data()); - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); } @@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor( return m_scratchFunctor; } -void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, - bool force_shrink) { - if (m_team_scratch_current_size == 0) { - m_team_scratch_current_size = bytes; - m_team_scratch_ptr = Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size); +std::pair CudaInternal::resize_team_scratch_space( + std::int64_t bytes, bool force_shrink) { + // Multiple ParallelFor/Reduce Teams can call this function at the same time + // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race + // condition. 
+ + int current_team_scratch = 0; + int zero = 0; + int one = 1; + while (m_team_scratch_pool[current_team_scratch].compare_exchange_weak( + zero, one, std::memory_order_release, std::memory_order_relaxed)) { + current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch; } - if ((bytes > m_team_scratch_current_size) || - ((bytes < m_team_scratch_current_size) && (force_shrink))) { - m_team_scratch_current_size = bytes; - m_team_scratch_ptr = Kokkos::kokkos_realloc( - m_team_scratch_ptr, m_team_scratch_current_size); + if (m_team_scratch_current_size[current_team_scratch] == 0) { + m_team_scratch_current_size[current_team_scratch] = bytes; + m_team_scratch_ptr[current_team_scratch] = + Kokkos::kokkos_malloc( + "Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[current_team_scratch]); } - return m_team_scratch_ptr; + if ((bytes > m_team_scratch_current_size[current_team_scratch]) || + ((bytes < m_team_scratch_current_size[current_team_scratch]) && + (force_shrink))) { + m_team_scratch_current_size[current_team_scratch] = bytes; + m_team_scratch_ptr[current_team_scratch] = + Kokkos::kokkos_realloc( + m_team_scratch_ptr[current_team_scratch], + m_team_scratch_current_size[current_team_scratch]); + } + return std::make_pair(m_team_scratch_ptr[current_team_scratch], + current_team_scratch); } //---------------------------------------------------------------------------- @@ -685,36 +742,43 @@ void CudaInternal::finalize() { if (m_scratchFunctorSize > 0) RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); - if (m_team_scratch_current_size > 0) - Kokkos::kokkos_free(m_team_scratch_ptr); + for (int i = 0; i < m_n_team_scratch; ++i) { + if (m_team_scratch_current_size[i] > 0) + Kokkos::kokkos_free(m_team_scratch_ptr[i]); + } - m_cudaDev = -1; - m_multiProcCount = 0; - m_maxWarpCount = 0; - m_maxBlock = 0; - m_maxSharedWords = 0; - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchUnifiedCount = 0; - m_streamCount = 0; - 
m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - m_scratchConcurrentBitset = nullptr; - m_stream = nullptr; - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + if (m_manage_stream && m_stream != nullptr) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream)); + + m_cudaDev = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } } // only destroy these if we're finalizing the singleton if (this == &singleton()) { - cudaFreeHost(constantMemHostStaging); - cudaEventDestroy(constantMemReusable); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable)); auto &deep_copy_space = Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); if (deep_copy_space) deep_copy_space->impl_internal_space_instance()->finalize(); - cudaStreamDestroy(cuda_get_deep_copy_stream()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream())); } } @@ -823,7 +887,7 @@ Cuda::Cuda() "Cuda instance constructor"); } -Cuda::Cuda(cudaStream_t stream) +Cuda::Cuda(cudaStream_t stream, bool manage_stream) : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { ptr->finalize(); delete ptr; @@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream) Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, - stream); + stream, manage_stream); } void Cuda::print_configuration(std::ostream &s, const bool) { 
Impl::CudaInternal::singleton().print_configuration(s); } -void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); } +void Cuda::impl_static_fence(const std::string &name) { + Kokkos::Impl::cuda_device_synchronize(name); +} +void Cuda::impl_static_fence() { + impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence"); +} -void Cuda::fence() const { m_space_instance->fence(); } +void Cuda::fence() const { + fence("Kokkos::Cuda::fence(): Unnamed Instance Fence"); +} +void Cuda::fence(const std::string &name) const { + m_space_instance->fence(name); +} const char *Cuda::name() { return "Cuda"; } +uint32_t Cuda::impl_instance_id() const noexcept { + return m_space_instance->impl_get_instance_id(); +} cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; } int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; } @@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) { } } -void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } +void CudaSpaceInitializer::fence() { + Kokkos::Cuda::impl_static_fence( + "Kokkos::CudaSpaceInitializer::fence: Initializer Fence"); +} +void CudaSpaceInitializer::fence(const std::string &name) { + // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence: + // "+name); //TODO: or this + Kokkos::Cuda::impl_static_fence(name); +} void CudaSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { @@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg, msg << "yes\n"; #else msg << "no\n"; -#endif - msg << " KOKKOS_ENABLE_CUSPARSE: "; -#ifdef KOKKOS_ENABLE_CUSPARSE - msg << "yes\n"; -#else - msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index aaec2c2926..7eb169838c 100644 --- 
a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -3,6 +3,9 @@ #include #include +#include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // These functions fulfill the purpose of allowing to work around @@ -114,10 +117,14 @@ class CudaInternal { mutable size_type* m_scratchFunctor; uint32_t* m_scratchConcurrentBitset; cudaStream_t m_stream; + uint32_t m_instance_id; + bool m_manage_stream; // Team Scratch Level 1 Space - mutable int64_t m_team_scratch_current_size; - mutable void* m_team_scratch_ptr; + int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[10]; + mutable void* m_team_scratch_ptr[10]; + mutable std::atomic_int m_team_scratch_pool[10]; bool was_initialized = false; bool was_finalized = false; @@ -135,7 +142,8 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(int cuda_device_id, cudaStream_t stream = nullptr); + void initialize(int cuda_device_id, cudaStream_t stream = nullptr, + bool manage_stream = false); void finalize(); void print_configuration(std::ostream&) const; @@ -145,6 +153,7 @@ class CudaInternal { static void cuda_set_serial_execution(bool); #endif + void fence(const std::string&) const; void fence() const; ~CudaInternal(); @@ -175,20 +184,68 @@ class CudaInternal { m_scratchFunctor(nullptr), m_scratchConcurrentBitset(nullptr), m_stream(nullptr), - m_team_scratch_current_size(0), - m_team_scratch_ptr(nullptr) {} + m_instance_id( + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this))) { + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + m_team_scratch_pool[i] = 0; + } + } // Resizing of reduction related scratch spaces size_type* scratch_space(const size_type size) const; size_type* 
scratch_flags(const size_type size) const; size_type* scratch_unified(const size_type size) const; size_type* scratch_functor(const size_type size) const; - + uint32_t impl_get_instance_id() const; // Resizing of team level 1 scratch - void* resize_team_scratch_space(std::int64_t bytes, - bool force_shrink = false); + std::pair resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); }; } // Namespace Impl + +namespace Experimental { +// Partitioning an Execution Space: expects space and integer arguments for +// relative weight +// Customization point for backends +// Default behavior is to return the passed in instance + +namespace Impl { +inline void create_Cuda_instances(std::vector& instances) { + for (int s = 0; s < int(instances.size()); s++) { + cudaStream_t stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + instances[s] = Cuda(stream, true); + } +} +} // namespace Impl + +template +std::vector partition_space(const Cuda&, Args...) { +#ifdef __cpp_fold_expressions + static_assert( + (... 
&& std::is_arithmetic_v), + "Kokkos Error: partitioning arguments must be integers or floats"); +#endif + std::vector instances(sizeof...(Args)); + Impl::create_Cuda_instances(instances); + return instances; +} + +template +std::vector partition_space(const Cuda&, std::vector& weights) { + static_assert( + std::is_arithmetic::value, + "Kokkos Error: partitioning arguments must be integers or floats"); + + std::vector instances(weights.size()); + Impl::create_Cuda_instances(instances); + return instances; +} +} // namespace Experimental + } // Namespace Kokkos #endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index d892a893b3..4b01798f5e 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func, #ifndef KOKKOS_ARCH_KEPLER // On Kepler the L1 has no benefit since it doesn't cache reads auto set_cache_config = [&] { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( func, (prefer_shmem ? 
cudaFuncCachePreferShared : cudaFuncCachePreferL1))); return prefer_shmem; @@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker< params.kernelParams = (void**)args; params.extra = nullptr; - CUDA_SAFE_CALL(cudaGraphAddKernelNode( + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode( &graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0, ¶ms)); } else { // We still need an empty node for the dependency structure - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } KOKKOS_ENSURES(bool(graph_node)) } @@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker< params.kernelParams = (void**)args; params.extra = nullptr; - CUDA_SAFE_CALL(cudaGraphAddKernelNode( + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode( &graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0, ¶ms)); } else { // We still need an empty node for the dependency structure - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } KOKKOS_ENSURES(bool(graph_node)) } @@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker< dim3 const& block, int shmem, CudaInternal const* cuda_instance) { // Wait until the previous kernel that uses the constant buffer is done - CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaEventSynchronize(cuda_instance->constantMemReusable)); // Copy functor (synchronously) to staging buffer in pinned host memory unsigned long* staging = cuda_instance->constantMemHostStaging; @@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker< 
get_kernel_func())<<m_stream>>>(); // Record an event that says when the constant buffer can be reused - CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, - cudaStream_t(cuda_instance->m_stream))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaEventRecord(cuda_instance->constantMemReusable, + cudaStream_t(cuda_instance->m_stream))); } #ifdef KOKKOS_CUDA_ENABLE_GRAPHS @@ -637,8 +641,9 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - cuda_instance->fence(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + cuda_instance->fence( + "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } @@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl< // the code and the result is visible. auto wrap_get_attributes = []() -> cudaFuncAttributes { cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); return attr_tmp; }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index ff31649544..1f3024f318 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -81,22 +81,34 @@ namespace Impl { CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0}; void initialize_host_cuda_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::init_lock_arrays(); + + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); +#endif if (g_host_cuda_lock_arrays.atomic != nullptr) return; - CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic, - sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1))); - CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch, - sizeof(int) * (Cuda::concurrency()))); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&g_host_cuda_lock_arrays.atomic, + sizeof(int) * 
(CUDA_SPACE_ATOMIC_MASK + 1))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch, + sizeof(int) * (Cuda::concurrency()))); + Impl::cuda_device_synchronize( + "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); g_host_cuda_lock_arrays.n = Cuda::concurrency(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256, 256>>>(Kokkos::Cuda::concurrency()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays"); } void finalize_host_cuda_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::finalize_lock_arrays(); +#endif + if (g_host_cuda_lock_arrays.atomic == nullptr) return; cudaFree(g_host_cuda_lock_arrays.atomic); g_host_cuda_lock_arrays.atomic = nullptr; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 7640b8084d..04fb7cb345 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -53,6 +53,10 @@ #include +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#include +#endif + namespace Kokkos { namespace Impl { @@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace } // namespace Impl } // namespace Kokkos + /* Dan Ibanez: it is critical that this code be a macro, so that it will capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! putting this in an inline function will NOT do the right thing! 
*/ #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ { \ if (::Kokkos::Impl::lock_array_copied == 0) { \ - CUDA_SAFE_CALL( \ + KOKKOS_IMPL_CUDA_SAFE_CALL( \ cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ &Kokkos::Impl::g_host_cuda_lock_arrays, \ sizeof(Kokkos::Impl::CudaLockArrays))); \ @@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } lock_array_copied = 1; \ } +#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else @@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif +#else + +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +#else +// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +#endif + +#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ + #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 2834e6f3de..f83b43e608 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -62,7 +62,6 @@ #include #include #include -#include #include #include @@ -240,9 +239,11 @@ class TeamPolicyInternal //---------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED inline int vector_length() const { return impl_vector_length(); } +#endif inline int impl_vector_length() const { return m_vector_length; } inline int team_size() const { return m_team_size; } inline int league_size() const { return m_league_size; } @@ -687,6 +688,7 @@ class ParallelFor, int m_shmem_size; void* 
m_scratch_ptr[2]; int m_scratch_size[2]; + int m_scratch_pool_id = -1; template __device__ inline @@ -797,15 +799,19 @@ class ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. m_scratch_ptr[0] = nullptr; - m_scratch_ptr[1] = - m_team_size <= 0 - ? nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast(Cuda::concurrency() / - (m_team_size * m_vector_size))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } const int shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -829,6 +835,14 @@ class ParallelFor, "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_pool[m_scratch_pool_id] = 0; + } + } }; } // namespace Impl @@ -870,9 +884,24 @@ class ParallelReduce, ReducerType, using value_type = typename ValueTraits::value_type; using reference_type = typename ValueTraits::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Cuda::size_type; - using index_type = typename Policy::index_type; - using reducer_type = ReducerType; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the reduction is performed. 
+ // Within the reduction, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the reduction, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = typename std::conditional< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + typename std::conditional::type, + Kokkos::Cuda::size_type>::type; + using index_type = typename Policy::index_type; + using reducer_type = ReducerType; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -883,9 +912,11 @@ class ParallelReduce, ReducerType, const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type* m_unified_space; + word_size_type* m_scratch_space; + // m_scratch_flags must be of type Cuda::size_type due to use of atomics + // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp + Cuda::size_type* m_scratch_flags; + word_size_type* m_unified_space; // Shall we use the shfl based reduction or not (only use it for static sized // types of more than 128bit) @@ -924,16 +955,16 @@ class ParallelReduce, ReducerType, __device__ inline void run(const DummySHMEMReductionType& ) const {*/ - const integral_nonzero_constant + const integral_nonzero_constant< + word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)> word_count(ValueTraits::value_size( ReducerConditional::select(m_functor, m_reducer)) / - sizeof(size_type)); + sizeof(word_size_type)); { reference_type value = ValueInit::init(ReducerConditional::select(m_functor, m_reducer), - 
kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Number of blocks is bounded so that the reduction can be limited to two @@ -958,11 +989,12 @@ class ParallelReduce, ReducerType, // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -985,17 +1017,17 @@ class ParallelReduce, ReducerType, if (cuda_single_inter_block_reduce_scan( ReducerConditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_cuda_shared_memory(), + gridDim.x, kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags)) { // This is the final block with the final result at the final threads' // location - size_type* const shared = - kokkos_impl_cuda_shared_memory() + + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + (blockDim.y - 1) * word_count.value; - size_type* const global = + word_size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -1100,15 +1132,21 @@ class ParallelReduce, ReducerType, KOKKOS_ASSERT(block_size > 0); - m_scratch_space = cuda_internal_scratch_space( + // TODO: down casting these uses more space than required? 
+ m_scratch_space = (word_size_type*)cuda_internal_scratch_space( m_policy.space(), ValueTraits::value_size(ReducerConditional::select( m_functor, m_reducer)) * block_size /* block_size == max block_count */); - m_scratch_flags = - cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), ValueTraits::value_size(ReducerConditional::select( - m_functor, m_reducer))); + + // Intentionally do not downcast to word_size_type since we use Cuda + // atomics in Kokkos_Cuda_ReduceScan.hpp + m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), + sizeof(Cuda::size_type)); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)))); // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); @@ -1139,7 +1177,9 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: Result " + "Not Device Accessible"); if (m_result_ptr) { if (m_unified_space) { @@ -1459,7 +1499,9 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: " + "Result Not Device Accessible"); if (m_result_ptr) { if (m_unified_space) { @@ -1580,6 +1622,7 @@ class ParallelReduce, size_type m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + int m_scratch_pool_id = -1; const size_type m_league_size; int m_team_size; const size_type m_vector_size; @@ -1821,7 +1864,9 @@ class ParallelReduce, true); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: Result " + "Not Device Accessible"); if 
(m_result_ptr) { if (m_unified_space) { @@ -1895,16 +1940,19 @@ class ParallelReduce, FunctorTeamShmemSize::value(arg_functor, m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_ptr[1] = - m_team_size <= 0 - ? nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - (static_cast( - Cuda::concurrency() / - (m_team_size * m_vector_size)))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } // The global parallel_reduce does not support vector_length other than 1 at // the moment @@ -1973,6 +2021,8 @@ class ParallelReduce, cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); + + // Valid team size not provided, deduce team size m_team_size = m_team_size >= 0 ? m_team_size @@ -1994,15 +2044,19 @@ class ParallelReduce, FunctorTeamShmemSize::value(arg_functor, m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_ptr[1] = - m_team_size <= 0 - ? 
nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast(Cuda::concurrency() / - (m_team_size * m_vector_size))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } // The global parallel_reduce does not support vector_length other than 1 at // the moment @@ -2030,13 +2084,28 @@ class ParallelReduce, Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); } - if (int(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + + size_type team_size_max = + Kokkos::Impl::cuda_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, m_functor, + m_vector_size, m_policy.team_scratch_size(0), + m_policy.thread_scratch_size(0)) / + m_vector_size; + + if ((int)m_team_size > (int)team_size_max) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " "large team size.")); } } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_pool[m_scratch_pool_id] = 0; + } + } }; } // namespace Impl @@ -2167,9 +2236,7 @@ class ParallelScan, Kokkos::Cuda> { for (typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end(); iwork_base += blockDim.y) { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; -#endif + unsigned MASK = __activemask(); const typename Policy::member_type iwork = iwork_base + threadIdx.y; __syncthreads(); // Don't overwrite previous iteration values until they @@ -2182,11 +2249,7 
@@ class ParallelScan, Kokkos::Cuda> { for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(MASK); if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } // Protect against large scan values. @@ -2457,9 +2520,7 @@ class ParallelScanWithTotal, for (typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end(); iwork_base += blockDim.y) { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; -#endif + unsigned MASK = __activemask(); const typename Policy::member_type iwork = iwork_base + threadIdx.y; @@ -2474,11 +2535,7 @@ class ParallelScanWithTotal, shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(MASK); if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } // Protect against large scan values. 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index fc9fc3770b..e5b05bcc64 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction( value_type tmp = Kokkos::shfl_down(value, 1, 32); if (id + 1 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - int active = KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + unsigned int mask = __activemask(); + int active = __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2, 32); if (id + 2 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4, 32); if (id + 4 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8, 32); if (id + 8 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16, 32); if (id + 16 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + 
active += __ballot_sync(mask, 1); } } // The last block has in its thread=0 the global reduction value through @@ -388,48 +368,28 @@ __device__ inline value_type tmp = Kokkos::shfl_down(value, 1, 32); if (id + 1 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - int active = KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + unsigned int mask = __activemask(); + int active = __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2, 32); if (id + 2 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4, 32); if (id + 4 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8, 32); if (id + 8 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16, 32); if (id + 16 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); } } @@ -573,23 +533,17 @@ struct CudaReductionsFunctor { // part of the reduction const int width) // How much of the 
warp participates { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK unsigned mask = width == 32 ? 0xffffffff : ((1 << width) - 1) << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width; -#endif const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32; for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) { if (lane_id + delta < 32) { ValueJoin::join(functor, value, value + delta); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(mask); } *value = *(value - lane_id); } @@ -612,17 +566,18 @@ struct CudaReductionsFunctor { const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32; if (delta < blockDim.x * blockDim.y) *my_shared_team_buffer_element = shared_team_buffer_element[delta]; - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false, blockDim.x * blockDim.y / 32); if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element; } } + template __device__ static inline bool scalar_inter_block_reduction( const FunctorType& functor, const Cuda::size_type /*block_id*/, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { Scalar* const global_team_buffer_element = ((Scalar*)global_data); Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x; @@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan( const pointer_type tdata_intra = base_data + value_count * threadIdx.y; { // Intra-warp reduction: - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); 
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } __syncthreads(); // Wait for all warps to reduce @@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan( const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask) << CudaTraits::WarpIndexShift; -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned inner_mask = - KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y)); -#endif + unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y)); if (rtid_inter < blockDim.y) { const pointer_type tdata_inter = base_data + value_count * (rtid_inter ^ BlockSizeMask); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK if ((1 << 5) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5) } if ((1 << 6) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6) } if ((1 << 7) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7) } if ((1 << 8) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8) } if ((1 << 9) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9) } -#else - if ((1 << 5) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5) - } - if ((1 << 6) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6) - } 
- if ((1 << 7) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7) - } - if ((1 << 8) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8) - } - if ((1 << 9) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9) - } -#endif if (DoScan) { int n = @@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan( if (!(rtid_inter + n < blockDim.y)) n = 0; -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 8) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 7) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 6) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 5) -#else - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 8) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 7) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 6) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 5) -#endif } } } @@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan( : ((rtid_intra & 16) ? 
16 : 0)))); if (!(rtid_intra + n < blockDim.y)) n = 0; - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } #undef BLOCK_SCAN_STEP @@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan( * Global reduce result is in the last threads' 'shared_data' location. */ -template +template __device__ bool cuda_single_inter_block_reduce_scan2( const FunctorType& functor, const Cuda::size_type block_id, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { - using size_type = Cuda::size_type; + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { + using size_type = SizeType; using ValueTraits = FunctorValueTraits; using ValueJoin = FunctorValueJoin; using ValueInit = FunctorValueInit; @@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2( return is_last_block; } -template +template __device__ bool cuda_single_inter_block_reduce_scan( const FunctorType& functor, const Cuda::size_type block_id, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { using ValueTraits = FunctorValueTraits; 
if (!DoScan && ValueTraits::StaticValueSize > 0) return Kokkos::Impl::CudaReductionsFunctor< diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 2004edbeac..88ac0d1878 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -54,11 +54,27 @@ #include #include -#include // CUDA_SAFE_CALL +#include // KOKKOS_IMPL_CUDA_SAFE_CALL #include //---------------------------------------------------------------------------- +#if defined(__CUDA_ARCH__) +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ + { \ + __syncwarp(); \ + const unsigned b = __activemask(); \ + if (b != 0xffffffff) { \ + printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \ + blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \ + threadIdx.z, b); \ + return; \ + } \ + } +#else +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) +#endif + namespace Kokkos { namespace Impl { namespace { @@ -138,13 +154,13 @@ class TaskQueueSpecialization> { // Broadcast task pointer: // Sync before the broadcast - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // pretend it's an int* for shuffle purposes ((int*)¤t_task)[0] = - KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[0], 0, 32); + __shfl_sync(0xffffffff, ((int*)¤t_task)[0], 0, 32); ((int*)¤t_task)[1] = - KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[1], 0, 32); + __shfl_sync(0xffffffff, ((int*)¤t_task)[1], 0, 32); if (current_task) { KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag()); @@ -168,7 +184,7 @@ class TaskQueueSpecialization> { // Synchronize threads of the warp and insure memory // writes are visible to all threads in the warp. - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (shared_memory_task_copy->is_team_runnable()) { // Thread Team Task @@ -182,7 +198,7 @@ class TaskQueueSpecialization> { // Synchronize threads of the warp and insure memory // writes are visible to all threads in the warp. 
- KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize; // b -= b % CudaTraits::WarpSize; @@ -196,7 +212,7 @@ class TaskQueueSpecialization> { // writes are visible to root thread of the warp for // respawn or completion. - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (warp_lane == 0) { // If respawn requested copy respawn data back to main memory @@ -249,12 +265,14 @@ class TaskQueueSpecialization> { auto& queue = scheduler.queue(); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Pre Task Execution"); // Query the stack size, in bytes: size_t previous_stack_size = 0; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: @@ -262,18 +280,21 @@ class TaskQueueSpecialization> { const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - CUDA_SAFE_CALL(cudaGetLastError()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); } } @@ -295,13 +316,17 @@ class TaskQueueSpecialization> { destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Pre Get Function Pointer for Tasks"); 
set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); ptr = *ptr_ptr; dtor = *dtor_ptr; @@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained< // count of 0 also. Otherwise, returns a task from another queue // or `end` if one couldn't be popped task_ptr = team_queue.attempt_to_steal_task(); -#if 0 - if(task != no_more_tasks_sentinel && task != end) { - std::printf("task stolen on rank %d\n", team_exec.league_rank()); - } -#endif } } // Synchronize warp with memory fence before broadcasting task pointer: // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // Broadcast task pointer: - ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32); - ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32); + ((int*)&task_ptr)[0] = + __shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32); + ((int*)&task_ptr)[1] = + __shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32); #if defined(KOKKOS_ENABLE_DEBUG) KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr"); @@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained< // writes are visible to all threads in the warp. // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (task_root_type::TaskTeam == task_shmem->m_task_type) { // Thread Team Task @@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained< // writes are visible to all threads in the warp. // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // copy task closure from shared to global memory: @@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained< // respawn or completion. 
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // If respawn requested copy respawn data back to main memory @@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained< auto& queue = scheduler.queue(); queue.initialize_team_queues(warps_per_block * grid.x); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Pre Execute Task"); // Query the stack size, in bytes: size_t previous_stack_size = 0; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: @@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained< const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - CUDA_SAFE_CALL(cudaGetLastError()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); } } @@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained< destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Pre Get Function Pointer"); set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); ptr = *ptr_ptr; dtor = *dtor_ptr; @@ -609,7 +640,7 @@ class TaskExec { __device__ void team_barrier() const { if (1 < m_team_size) { - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } } @@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single( //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index e780639015..922b980a25 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -340,191 +340,6 @@ class CudaTeamMember { #endif } - //-------------------------------------------------------------------------- - /**\brief Global reduction across all blocks - * - * Return !0 if reducer contains the final value - */ - template - KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, int>::type - global_reduce(ReducerType const& reducer, int* const global_scratch_flags, - void* const global_scratch_space, void* const shmem, - int const shmem_size) { -#ifdef __CUDA_ARCH__ - - using value_type = typename ReducerType::value_type; - using pointer_type = value_type volatile*; - - // Number of shared memory entries for the reduction: - const int nsh = shmem_size / sizeof(value_type); - - // Number of CUDA threads in the block, rank within the block - const int nid = blockDim.x * blockDim.y * blockDim.z; - const int tid = - threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); - - // Reduces within block using all available shared memory - // Contributes if it is 
the root "vector lane" - - // wn == number of warps in the block - // wx == which lane within the warp - // wy == which warp within the block - - const int wn = - (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift; - const int wx = tid & CudaTraits::WarpIndexMask; - const int wy = tid >> CudaTraits::WarpIndexShift; - - //------------------------ - { // Intra warp shuffle reduction from contributing CUDA threads - - value_type tmp(reducer.reference()); - - for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) { - Impl::in_place_shfl_down(reducer.reference(), tmp, i, - CudaTraits::WarpSize); - - // Root of each vector lane reduces "thread" contribution - if (0 == threadIdx.x && wx < i) { - reducer.join(&tmp, reducer.data()); - } - } - - // Reduce across warps using shared memory. - // Number of warps may not be power of two. - - __syncthreads(); // Wait before shared data write - - // Number of shared memory entries for the reduction - // is at most one per warp - const int nentry = wn < nsh ? wn : nsh; - - if (0 == wx && wy < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - ((value_type*)shmem)[wy] = tmp; - } - - __syncthreads(); // Wait for write to be visible to block - - // When more warps than shared entries - // then warps must take turns joining their contribution - // to the designated shared memory entry. 
- for (int i = nentry; i < wn; i += nentry) { - const int k = wy - i; - - if (0 == wx && i <= wy && k < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - reducer.join(((value_type*)shmem) + k, &tmp); - } - - __syncthreads(); // Wait for write to be visible to block - } - - // One warp performs the inter-warp reduction: - - if (0 == wy) { - // Start fan-in at power of two covering nentry - - for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nentry) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); - __threadfence_block(); // Wait for write to be visible to warp - } - } - } - } - //------------------------ - { // Write block's value to global_scratch_memory - - int last_block = 0; - - if (0 == wx) { - reducer.copy(((pointer_type)global_scratch_space) + - blockIdx.x * reducer.length(), - reducer.data()); - - __threadfence(); // Wait until global write is visible. - - last_block = (int)gridDim.x == - 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); - - // If last block then reset count - if (last_block) *global_scratch_flags = 0; - } - - last_block = __syncthreads_or(last_block); - - if (!last_block) return 0; - } - //------------------------ - // Last block reads global_scratch_memory into shared memory. - - const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh) - : (gridDim.x < nsh ? 
gridDim.x : nsh); - - // nentry = min( nid , nsh , gridDim.x ) - - // whole block reads global memory into shared memory: - - if (tid < nentry) { - const int offset = tid * reducer.length(); - - reducer.copy(((pointer_type)shmem) + offset, - ((pointer_type)global_scratch_space) + offset); - - for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) { - reducer.join( - ((pointer_type)shmem) + offset, - ((pointer_type)global_scratch_space) + i * reducer.length()); - } - } - - __syncthreads(); // Wait for writes to be visible to block - - if (0 == wy) { - // Iterate to reduce shared memory to single warp fan-in size - - const int nreduce = - CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry; - - // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x ) - - if (wx < nreduce && nreduce < nentry) { - for (int i = nreduce + wx; i < nentry; i += nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); - } - __threadfence_block(); // Wait for writes to be visible to warp - } - - // Start fan-in at power of two covering nentry - - for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); - __threadfence_block(); // Wait for writes to be visible to warp - } - } - - if (0 == wx) { - reducer.copy(reducer.data(), (pointer_type)shmem); - return 1; - } - } - return 0; - -#else - (void)reducer; - (void)global_scratch_flags; - (void)global_scratch_space; - (void)shmem; - (void)shmem_size; - return 0; -#endif - } - //---------------------------------------- // Private for the driver @@ -533,7 +348,7 @@ class CudaTeamMember { void* scratch_level_1_ptr, const int scratch_level_1_size, const int arg_league_rank, const int arg_league_size) : m_team_reduce(shared), - m_team_shared(((char*)shared) + shared_begin, shared_size, + m_team_shared(static_cast(shared) + shared_begin, shared_size, scratch_level_1_ptr, 
scratch_level_1_size), m_team_reduce_size(shared_begin), m_league_rank(arg_league_rank), @@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for( i += blockDim.x) { closure(i); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } @@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single( (void)lambda; #ifdef __CUDA_ARCH__ if (threadIdx.x == 0) lambda(); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } @@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single( (void)lambda; #ifdef __CUDA_ARCH__ if (threadIdx.x == 0 && threadIdx.y == 0) lambda(); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 
0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index 7f7b7b6e78..31d3c47e1c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -48,7 +48,12 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include + +#if !defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long) +#else +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int) +#endif namespace Kokkos { @@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu; // Shuffle operations require input to be a register (stack) variable // Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width), -// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK +// which turns in to one of __shfl_sync(_up|_down) // Since the logic with respect to value sizes, etc., is the same everywhere, // put it all in one place. 
template @@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op { (void)val; (void)lane; (void)width; - return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width); + return __shfl_sync(mask, val, lane, width); } }; template @@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, int lane, int width) const noexcept { - return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width); + return __shfl_up_sync(mask, val, lane, width); } }; template @@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { (void)val; (void)lane; (void)width; - return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width); + return __shfl_down_sync(mask, val, lane, width); } }; template @@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width, } // end namespace Kokkos +#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF + #endif // defined( KOKKOS_ENABLE_CUDA ) #endif // !defined( KOKKOS_CUDA_VECTORIZATION_HPP ) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp deleted file mode 100644 index 0cdd84ce27..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#include - -#if defined(__CUDA_ARCH__) -#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask() -#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff) -#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m) -#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x) -#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x) -#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z) -#define 
KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \ - __shfl_down_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z) -#else -#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0 -#define KOKKOS_IMPL_CUDA_SYNCWARP -#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m -#define KOKKOS_IMPL_CUDA_BALLOT(x) 0 -#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0 -#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0 -#endif - -#if !defined(KOKKOS_COMPILER_CLANG) -#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long) -#else -#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int) -#endif - -#if defined(__CUDA_ARCH__) -#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ - { \ - __syncwarp(); \ - const unsigned b = __activemask(); \ - if (b != 0xffffffff) { \ - printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \ - blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \ - threadIdx.z, b); \ - return; \ - } \ - } -#else -#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) -#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 9278d1bdc9..7eb3e1e9f7 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -45,6 +45,7 @@ #ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP #define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP +#include #include #if defined(__HIPCC__) @@ -56,118 +57,239 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template -void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { - // FIXME_HIP - currently the "constant" path is unimplemented. 
- // we should look at whether it's functional, and - // perform some simple scaling studies to see when / - // if the constant launcher outperforms the current - // pass by pointer shared launcher - HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks, - hip_parallel_launch_local_memory, - blockSize, sharedmem)); -} +enum class BlockType { Max, Preferred }; -template -void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { - hipOccupancy( - numBlocks, blockSize, sharedmem); -} - -template -int hip_internal_get_block_size(const F &condition_check, - const HIPInternal *hip_instance, - const hipFuncAttributes &attr, - const FunctorType &f, - const size_t vector_length, - const size_t shmem_block, - const size_t shmem_thread) { - const int min_blocks_per_sm = - LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; - const int max_threads_per_block = LaunchBounds::maxTperB == 0 - ? HIPTraits::MaxThreadsPerBlock - : LaunchBounds::maxTperB; - - const int regs_per_wavefront = std::max(attr.numRegs, 1); - const int regs_per_sm = hip_instance->m_regsPerSM; - const int shmem_per_sm = hip_instance->m_shmemPerSM; - const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock; - const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM; - const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM; - - int block_size = max_threads_per_block; - KOKKOS_ASSERT(block_size > 0); - const int blocks_per_warp = - (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize; - - int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); - int max_blocks_shmem = - (total_shmem < max_shmem_per_block) - ? (total_shmem > 0 ? 
shmem_per_sm / total_shmem : max_blocks_regs) - : 0; - int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); - int threads_per_sm = blocks_per_sm * block_size; - if (threads_per_sm > max_threads_per_sm) { - blocks_per_sm = max_threads_per_sm / block_size; - threads_per_sm = blocks_per_sm * block_size; - } - int opt_block_size = - (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; - int opt_threads_per_sm = threads_per_sm; - block_size -= HIPTraits::WarpSize; - while (condition_check(blocks_per_sm) && - (block_size >= HIPTraits::WarpSize)) { - functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); - max_blocks_shmem = - (total_shmem < max_shmem_per_block) - ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) - : 0; - blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); - threads_per_sm = blocks_per_sm * block_size; - if (threads_per_sm > max_threads_per_sm) { - blocks_per_sm = max_threads_per_sm / block_size; - threads_per_sm = blocks_per_sm * block_size; +template , + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +unsigned get_preferred_blocksize_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // use the user specified value + return LaunchBounds::maxTperB; + } else { + if (HIPParallelLaunch::get_scratch_size() > 0) { + return HIPTraits::ConservativeThreadsPerBlock; } - if ((blocks_per_sm >= min_blocks_per_sm) && - (blocks_per_sm <= max_blocks_per_sm)) { - if (threads_per_sm >= opt_threads_per_sm) { - opt_block_size = block_size; - opt_threads_per_sm = threads_per_sm; + return HIPTraits::MaxThreadsPerBlock; + } +} + +// FIXME_HIP - entire function could be constexpr for c++17 +template , 
+ HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +unsigned get_max_blocksize_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // use the user specified value + return LaunchBounds::maxTperB; + } else { + // we can always fit 1024 threads blocks if we only care about registers + // ... and don't mind spilling + return HIPTraits::MaxThreadsPerBlock; + } +} + +// convenience method to select and return the proper function attributes +// for a kernel, given the launch bounds et al. +template , + BlockType BlockSize = BlockType::Max, + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +hipFuncAttributes get_hip_func_attributes_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // for user defined, we *always* honor the request + return HIPParallelLaunch::get_hip_func_attributes(); + } else { + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + return HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); + } else { + const int blocksize = + get_preferred_blocksize_impl(); + if (blocksize == HIPTraits::MaxThreadsPerBlock) { + return HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); + } else { + return HIPParallelLaunch< + DriverType, + Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); } } - block_size -= HIPTraits::WarpSize; } - return opt_block_size; } -template -int hip_get_max_block_size(const HIPInternal *hip_instance, - const hipFuncAttributes &attr, const FunctorType &f, - const size_t vector_length, const size_t shmem_block, - const size_t shmem_thread) { - return hip_internal_get_block_size( - [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, - shmem_block, shmem_thread); +// Given an initial block-size 
limitation based on register usage +// determine the block size to select based on LDS limitation +template +unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, + const ShmemFunctor &f, + const unsigned tperb_reg) { + // translate LB from CUDA to HIP + const unsigned min_waves_per_eu = + LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1; + const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; + const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; + unsigned block_size = tperb_reg; + do { + unsigned total_shmem = f(block_size); + // find how many threads we can fit with this blocksize based on LDS usage + unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size; + + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + // we want the maximum blocksize possible + // just wait until we get a case where we can fit the LDS per SM + if (tperb_shmem) return block_size; + } else { + if (block_size == tperb_reg && tperb_shmem >= tperb_reg) { + // fast path for exit on first iteration if registers are more limiting + // than LDS usage, just use the register limited size + return tperb_reg; + } + // otherwise we need to apply a heuristic to choose the blocksize + // the current launchbound selection scheme is: + // 1. If no spills, choose 1024 [MaxThreadsPerBlock] + // 2. Otherwise, choose 256 [ConservativeThreadsPerBlock] + // + // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB + // and we'll already have pretty decent occupancy, thus dropping to 256 + // *probably* isn't a concern + const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem; + const unsigned tperb = tperb_shmem < tperb_reg ? 
tperb_shmem : tperb_reg; + + // for anything with > 4 WF's & can fit multiple blocks + // we're probably not occupancy limited so just return that + if (blocks_per_cu_shmem > 1 && + tperb > HIPTraits::ConservativeThreadsPerBlock) { + return block_size; + } + + // otherwise, it's probably better to drop to the first valid size that + // fits in the ConservativeThreadsPerBlock + if (tperb >= min_threads_per_sm) return block_size; + } + block_size >>= 1; + } while (block_size >= HIPTraits::WarpSize); + // TODO: return a negative, add an error to kernel launch + return 0; } -template -int hip_get_opt_block_size(HIPInternal const *hip_instance, - hipFuncAttributes const &attr, FunctorType const &f, - size_t const vector_length, size_t const shmem_block, - size_t const shmem_thread) { - return hip_internal_get_block_size( - [](int) { return true; }, hip_instance, attr, f, vector_length, - shmem_block, shmem_thread); +// Standardized blocksize deduction for parallel constructs with no LDS usage +// Returns the preferred blocksize as dictated by register usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_preferred_blocksize() { + return get_preferred_blocksize_impl(); +} + +// Standardized blocksize deduction for parallel constructs with no LDS usage +// Returns the max blocksize as dictated by register usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. 
+template +unsigned hip_get_max_blocksize() { + return get_max_blocksize_impl(); +} + +// Standardized blocksize deduction for non-teams parallel constructs with LDS +// usage Returns the 'preferred' blocksize, as determined by the heuristics in +// hip_internal_get_block_size +// +// The ShmemFunctor takes a single argument of the current blocksize under +// consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance, + ShmemFunctor const &f) { + // get preferred blocksize limited by register usage + const unsigned tperb_reg = + hip_get_preferred_blocksize(); + return hip_internal_get_block_size(hip_instance, f, tperb_reg); +} + +// Standardized blocksize deduction for teams-based parallel constructs with LDS +// usage Returns the 'preferred' blocksize, as determined by the heuristics in +// hip_internal_get_block_size +// +// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and +// the current blocksize under consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. 
+template +unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance, + ShmemTeamsFunctor const &f) { + hipFuncAttributes attr = + get_hip_func_attributes_impl(); + // get preferred blocksize limited by register usage + using namespace std::placeholders; + const unsigned tperb_reg = + hip_get_preferred_blocksize(); + return hip_internal_get_block_size( + hip_instance, std::bind(f, attr, _1), tperb_reg); +} + +// Standardized blocksize deduction for non-teams parallel constructs with LDS +// usage Returns the maximum possible blocksize, as determined by the heuristics +// in hip_internal_get_block_size +// +// The ShmemFunctor takes a single argument of the current blocksize under +// consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_max_blocksize(HIPInternal const *hip_instance, + ShmemFunctor const &f) { + // get max blocksize limited by register usage + const unsigned tperb_reg = hip_get_max_blocksize(); + return hip_internal_get_block_size( + hip_instance, f, tperb_reg); +} + +// Standardized blocksize deduction for teams-based parallel constructs with LDS +// usage Returns the maximum possible blocksize, as determined by the heuristics +// in hip_internal_get_block_size +// +// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and +// the current blocksize under consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. 
+template +unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance, + ShmemTeamsFunctor const &f) { + hipFuncAttributes attr = + get_hip_func_attributes_impl(); + // get max blocksize + using namespace std::placeholders; + const unsigned tperb_reg = hip_get_max_blocksize(); + return hip_internal_get_block_size( + hip_instance, std::bind(f, attr, _1), tperb_reg); } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index b3480bcad0..a75e7a4a6c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + +KOKKOS_DEPRECATED +inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name, + const char* file = nullptr, + const int line = 0) { + hip_internal_safe_call(e, name, file, line); +} + +#endif + } // namespace Impl } // namespace Kokkos -#define HIP_SAFE_CALL(call) \ +#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \ Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 +#define HIP_SAFE_CALL(call) \ + Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \ + __LINE__) + +#endif + namespace Kokkos { namespace Experimental { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 18ef10e22c..336ac8c698 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,7 @@ class HIPInternalDevices { }; HIPInternalDevices::HIPInternalDevices() { - HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount)); if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) { Kokkos::abort( @@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() { "have. 
Please report this to github.com/kokkos/kokkos."); } for (int i = 0; i < m_hipDevCount; ++i) { - HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i)); } } @@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() { } } // namespace +unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr; +hipEvent_t Impl::HIPInternal::constantMemReusable = nullptr; + namespace Impl { //---------------------------------------------------------------------------- @@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const { return 0 <= m_hipDev; } +uint32_t HIPInternal::impl_get_instance_id() const noexcept { + return m_instance_id; +} HIPInternal &HIPInternal::singleton() { static HIPInternal *self = nullptr; if (!self) { @@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() { } void HIPInternal::fence() const { - HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); - // can reset our cycle id now as well - m_cycleId = 0; + fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence"); +} +void HIPInternal::fence(const std::string &name) const { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ + impl_get_instance_id()}, + [&]() { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; + }); } -void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { +void HIPInternal::initialize(int hip_device_id, hipStream_t stream, + bool manage_stream) { if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); @@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_hipDev = hip_device_id; m_deviceProp = hipProp; - HIP_SAFE_CALL(hipSetDevice(m_hipDev)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev)); m_stream = 
stream; + m_manage_stream = manage_stream; m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; @@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { // theoretically, we can get 40 WF's / CU, but only can sustain 32 // see // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - m_maxBlocksPerSM = 32; + m_maxWavesPerCU = 32; // FIXME_HIP - Nick to implement this upstream // Register count comes from Sec. 2.2. "Data Sharing" of the // Vega 7nm ISA document (see the diagram) @@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_regsPerSM = 65536; m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; m_maxShmemPerBlock = hipProp.sharedMemPerBlock; - m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; + m_maxThreadsPerSM = m_maxWavesPerCU * HIPTraits::WarpSize; //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. 
@@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_scratchConcurrentBitset = reinterpret_cast(r->data()); - HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0, - sizeof(uint32_t) * buffer_bound)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); } //---------------------------------- @@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { // Init the array for used for arbitrarily sized atomics if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (m_stream == nullptr) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostMalloc((void **)&constantMemHostStaging, + HIPTraits::ConstantMemoryUsage)); + + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable)); + } } //---------------------------------------------------------------------------- @@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( m_scratchFlags = reinterpret_cast(r->data()); - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); } @@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, //---------------------------------------------------------------------------- void HIPInternal::finalize() { - this->fence(); + this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordHIP = @@ -378,6 +405,9 @@ void HIPInternal::finalize() { if (m_team_scratch_current_size > 0) Kokkos::kokkos_free(m_team_scratch_ptr); + if (m_manage_stream && m_stream != nullptr) + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); + m_hipDev = -1; m_hipArch = -1; m_multiProcCount = 0; @@ -395,28 +425,36 @@ void 
HIPInternal::finalize() { m_team_scratch_ptr = nullptr; } if (nullptr != d_driverWorkArray) { - HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); d_driverWorkArray = nullptr; } + + // only destroy these if we're finalizing the singleton + if (this == &singleton()) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); + } } char *HIPInternal::get_next_driver(size_t driverTypeSize) const { std::lock_guard const lock(m_mutexWorkArray); if (d_driverWorkArray == nullptr) { - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipHostMalloc(&d_driverWorkArray, m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), hipHostMallocNonCoherent)); } if (driverTypeSize > m_maxDriverTypeSize) { // fence handles the cycle id reset for us - fence(); - HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + fence( + "Kokkos::HIPInternal::get_next_driver: fence before reallocating " + "resources"); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); m_maxDriverTypeSize = driverTypeSize; if (m_maxDriverTypeSize % 128 != 0) m_maxDriverTypeSize = m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipHostMalloc(&d_driverWorkArray, m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), hipHostMallocNonCoherent)); @@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const { m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; if (m_cycleId == 0) { // ensure any outstanding kernels are completed before we wrap around - fence(); + fence( + "Kokkos::HIPInternal::get_next_driver: fence before reusing first " + "driver"); } } return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; @@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags( namespace Kokkos { namespace Impl { -void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); } +void 
hip_device_synchronize(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); }); +} void hip_internal_error_throw(hipError_t e, const char *name, const char *file, const int line) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index f4f88628e3..967c6fdd4b 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -48,6 +48,7 @@ #define KOKKOS_HIP_INSTANCE_HPP #include +#include #include @@ -59,10 +60,12 @@ struct HIPTraits { static int constexpr WarpSize = 64; static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ + static int constexpr ConservativeThreadsPerBlock = + 256; // conservative fallback blocksize in case of spills static int constexpr MaxThreadsPerBlock = - 1024; // FIXME_HIP -- assumed constant for now - + 1024; // the maximum we can fit in a block static int constexpr ConstantMemoryUsage = 0x008000; /* 32k bytes */ + static int constexpr KernelArgumentLimit = 0x001000; /* 4k bytes */ static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */ }; @@ -90,7 +93,7 @@ class HIPInternal { unsigned m_multiProcCount = 0; unsigned m_maxWarpCount = 0; unsigned m_maxBlock = 0; - unsigned m_maxBlocksPerSM = 0; + unsigned m_maxWavesPerCU = 0; unsigned m_maxSharedWords = 0; int m_regsPerSM; int m_shmemPerSM = 0; @@ -108,6 +111,8 @@ class HIPInternal { mutable int m_cycleId = 0; // mutex to access d_driverWorkArray mutable std::mutex m_mutexWorkArray; + // mutex to access shared memory + mutable std::mutex m_mutexSharedMemory; // Scratch Spaces for Reductions size_type m_scratchSpaceCount = 0; @@ -119,7 +124,10 @@ class 
HIPInternal { hipDeviceProp_t m_deviceProp; - hipStream_t m_stream = nullptr; + hipStream_t m_stream = nullptr; + uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< + Kokkos::Experimental::HIP>(reinterpret_cast(this)); + bool m_manage_stream = false; // Team Scratch Level 1 Space mutable int64_t m_team_scratch_current_size = 0; @@ -128,18 +136,25 @@ class HIPInternal { bool was_finalized = false; + // FIXME_HIP: these want to be per-device, not per-stream... use of 'static' + // here will break once there are multiple devices though + static unsigned long *constantMemHostStaging; + static hipEvent_t constantMemReusable; + static HIPInternal &singleton(); int verify_is_initialized(const char *const label) const; int is_initialized() const { return m_hipDev >= 0; } - void initialize(int hip_device_id, hipStream_t stream = nullptr); + void initialize(int hip_device_id, hipStream_t stream = nullptr, + bool manage_stream = false); void finalize(); void print_configuration(std::ostream &) const; void fence() const; + void fence(const std::string &) const; // returns the next driver type pointer in our work array char *get_next_driver(size_t driverTypeSize) const; @@ -151,13 +166,52 @@ class HIPInternal { // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); size_type *scratch_flags(const size_type size); - + uint32_t impl_get_instance_id() const noexcept; // Resizing of team level 1 scratch void *resize_team_scratch_space(std::int64_t bytes, bool force_shrink = false); }; } // namespace Impl + +// Partitioning an Execution Space: expects space and integer arguments for +// relative weight +// Customization point for backends +// Default behavior is to return the passed in instance + +namespace Impl { +inline void create_HIP_instances(std::vector &instances) { + for (int s = 0; s < int(instances.size()); s++) { + hipStream_t stream; + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); + instances[s] = 
HIP(stream, true); + } +} +} // namespace Impl + +template +std::vector partition_space(const HIP &, Args...) { +#ifdef __cpp_fold_expressions + static_assert( + (... && std::is_arithmetic_v), + "Kokkos Error: partitioning arguments must be integers or floats"); +#endif + + std::vector instances(sizeof...(Args)); + Impl::create_HIP_instances(instances); + return instances; +} + +template +std::vector partition_space(const HIP &, std::vector &weights) { + static_assert( + std::is_arithmetic::value, + "Kokkos Error: partitioning arguments must be integers or floats"); + + std::vector instances(weights.size()); + Impl::create_HIP_instances(instances); + return instances; +} } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index f774423b37..f209edf7c0 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -52,6 +52,7 @@ #include #include #include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -64,7 +65,7 @@ namespace Kokkos { namespace Experimental { template inline __device__ T *kokkos_impl_hip_shared_memory() { - HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh); + extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[]; return (T *)sh; } } // namespace Experimental @@ -74,10 +75,12 @@ namespace Kokkos { namespace Experimental { namespace Impl { +// The hip_parallel_launch_*_memory code is identical to the cuda code template __global__ static void hip_parallel_launch_constant_memory() { const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); + driver(); } @@ -87,12 +90,13 @@ __global__ __launch_bounds__( const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); - driver->operator()(); + driver(); } template __global__ static void hip_parallel_launch_local_memory( const 
DriverType *driver) { + // FIXME_HIP driver() pass by copy driver->operator()(); } @@ -101,6 +105,21 @@ __global__ __launch_bounds__( maxTperB, minBperSM) static void hip_parallel_launch_local_memory(const DriverType *driver) { + // FIXME_HIP driver() pass by copy + driver->operator()(); +} + +template +__global__ static void hip_parallel_launch_global_memory( + const DriverType *driver) { + driver->operator()(); +} + +template +__global__ __launch_bounds__( + maxTperB, + minBperSM) static void hip_parallel_launch_global_memory(const DriverType + *driver) { driver->operator()(); } @@ -127,33 +146,238 @@ struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; +// Use local memory up to ConstantMemoryUseThreshold +// Use global memory above ConstantMemoryUsage +// In between use ConstantMemory +// The following code is identical to the cuda code +template +struct DeduceHIPLaunchMechanism { + static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t + light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; + static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t + heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; + static constexpr typename DriverType::Policy::work_item_property property = + typename DriverType::Policy::work_item_property(); + + static constexpr HIPLaunchMechanism valid_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::Default) | + (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::Default) | + HIPLaunchMechanism::GlobalMemory; + + static constexpr HIPLaunchMechanism requested_launch_mechanism = + (((property & light_weight) == light_weight) + ? 
HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::ConstantMemory) | + HIPLaunchMechanism::GlobalMemory; + + static constexpr HIPLaunchMechanism default_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold) + ? HIPLaunchMechanism::LocalMemory + : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage) + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory); + + // None LightWeight HeavyWeight + // F +struct HIPParallelLaunchKernelFuncData { + static unsigned int get_scratch_size( + hipFuncAttributes const &hip_func_attributes) { + return hip_func_attributes.localSizeBytes; + } + + static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) { + static hipFuncAttributes attr = [=]() { + hipFuncAttributes attr; + KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func)); + return attr; + }(); + return attr; + } +}; + +//---------------------------------------------------------------// +// HIPParallelLaunchKernelFunc structure and its specializations // +//---------------------------------------------------------------// template struct HIPParallelLaunchKernelFunc; +// HIPLaunchMechanism::LocalMemory specializations template struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { return hip_parallel_launch_local_memory; } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } }; template struct HIPParallelLaunchKernelFunc, HIPLaunchMechanism::LocalMemory> { + using funcdata_t = + 
HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { - return hip_parallel_launch_local_memory; + return HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>::get_kernel_func(); + } + + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); } }; +// HIPLaunchMechanism::GlobalMemory specializations +template +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::GlobalMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::GlobalMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_global_memory; + } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::GlobalMemory> { + using funcdata_t = + HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::GlobalMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_global_memory; + } + + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +// HIPLaunchMechanism::ConstantMemory specializations +template +struct HIPParallelLaunchKernelFunc< + 
DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::ConstantMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::ConstantMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_constant_memory; + } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::ConstantMemory> { + using funcdata_t = + HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::ConstantMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_constant_memory; + } + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +//------------------------------------------------------------------// +// HIPParallelLaunchKernelInvoker structure and its specializations // +//------------------------------------------------------------------// template struct HIPParallelLaunchKernelInvoker; +// HIPLaunchMechanism::LocalMemory specialization template struct HIPParallelLaunchKernelInvoker @@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = HIPParallelLaunchKernelFunc; + + // FIXME_HIP the code is different than cuda because driver cannot be passed + // by copy + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + 
(base_t::get_kernel_func())<<m_stream>>>( + driver); + } +}; + +// HIPLaunchMechanism::ConstantMemory specializations +template +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = + HIPParallelLaunchKernelFunc; + static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage, + "Kokkos Error: Requested HIPLaunchConstantMemory with a " + "Functor larger than 32kB."); + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + // Wait until the previous kernel that uses the constant buffer is done + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventSynchronize(hip_instance->constantMemReusable)); + + // Copy functor (synchronously) to staging buffer in pinned host memory + unsigned long *staging = hip_instance->constantMemHostStaging; + std::memcpy((void *)staging, (void *)driver, sizeof(DriverType)); + + // Copy functor asynchronously from there to constant memory on the device + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync( + HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging, + sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream)); + + // Invoke the driver function on the device + (base_t:: + get_kernel_func())<<m_stream>>>(); + + // Record an event that says when the constant buffer can be reused + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable, + hip_instance->m_stream)); + } +}; + +//-----------------------------// +// HIPParallelLaunch structure // +//-----------------------------// template , - HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> struct HIPParallelLaunch; template + unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism> struct HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> + LaunchMechanism> : HIPParallelLaunchKernelInvoker< 
DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> { + LaunchMechanism> { using base_t = HIPParallelLaunchKernelInvoker< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory>; + LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, @@ -205,22 +491,48 @@ struct HIPParallelLaunch< base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - HIP_SAFE_CALL(hipGetLastError()); - hip_instance->fence(); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError()); + hip_instance->fence( + "Kokkos::Experimental::Impl::HIParallelLaunch: Debug Only Check for " + "Execution Error"); #endif } } - - static hipFuncAttributes get_hip_func_attributes() { - static hipFuncAttributes attr = []() { - hipFuncAttributes attr; - HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, reinterpret_cast(base_t::get_kernel_func()))); - return attr; - }(); - return attr; - } }; + +// convenience method to launch the correct kernel given the launch bounds et +// al. 
+template , + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +void hip_parallel_launch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool prefer_shmem) { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // for user defined, we *always* honor the request + HIPParallelLaunch( + driver, grid, block, shmem, hip_instance, prefer_shmem); + } else { + // we can do what we like + const unsigned flat_block_size = block.x * block.y * block.z; + if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) { + // we have to use the large blocksize + HIPParallelLaunch< + DriverType, + Kokkos::LaunchBounds, + LaunchMechanism>(driver, grid, block, shmem, hip_instance, + prefer_shmem); + } else { + HIPParallelLaunch, + LaunchMechanism>(driver, grid, block, shmem, + hip_instance, prefer_shmem); + } + } +} } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp index 4f5271b6f6..c4292d35ec 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -84,11 +84,17 @@ namespace Impl { HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0}; void initialize_host_hip_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::init_lock_arrays(); + + DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); +#endif + if (g_host_hip_lock_arrays.atomic != nullptr) return; - HIP_SAFE_CALL(hipMalloc( + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( &g_host_hip_lock_arrays.atomic, sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); - HIP_SAFE_CALL(hipMalloc( + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( &g_host_hip_lock_arrays.scratch, sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); @@ -103,10 +109,14 @@ void 
initialize_host_hip_lock_arrays() { } void finalize_host_hip_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::finalize_lock_arrays(); +#endif + if (g_host_hip_lock_arrays.atomic == nullptr) return; - HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); g_host_hip_lock_arrays.atomic = nullptr; - HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); g_host_hip_lock_arrays.scratch = nullptr; g_host_hip_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp index f34f85f43b..71b104c2e4 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -51,6 +51,10 @@ #include +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#include +#endif + namespace Kokkos { namespace Impl { @@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } #define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ { \ if (::Kokkos::Impl::lock_array_copied == 0) { \ - HIP_SAFE_CALL(hipMemcpyToSymbol( \ + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol( \ HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \ &::Kokkos::Impl::g_host_hip_lock_arrays, \ sizeof(::Kokkos::Impl::HIPLockArrays))); \ @@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } ::Kokkos::Impl::lock_array_copied = 1; \ } +#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() #else @@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() #endif +#else + +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +#else +// Still Need 
COPY_CUDA_LOCK_ARRAYS for team scratch etc. +#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +#endif + +#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ + #endif /* defined( __HIPCC__ ) */ #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp index ce1aff9586..acb538e1cb 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties( space.impl_internal_space_instance()->m_maxThreadsPerSM; properties.default_largest_tile_size = 16; properties.default_tile_size = 4; - properties.max_total_tile_size = 1024; + properties.max_total_tile_size = + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; return properties; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 35e7d6fb85..eae323dd91 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -81,6 +81,8 @@ class ParallelFor, } inline void execute() const { + using ClosureType = + ParallelFor; if (m_policy.m_num_tiles == 0) return; array_index_type const maxblocks = static_cast( m_policy.space().impl_internal_space_instance()->m_maxBlock); @@ -94,7 +96,8 @@ class ParallelFor, block.y, maxblocks), 1); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 3) { @@ -110,7 +113,8 @@ class ParallelFor, std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + 
Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 4) { @@ -128,7 +132,8 @@ class ParallelFor, std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 5) { @@ -147,7 +152,8 @@ class ParallelFor, std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 6) { @@ -165,7 +171,8 @@ class ParallelFor, std::min(static_cast(m_policy.m_tile_end[4] * m_policy.m_tile_end[5]), static_cast(maxblocks))); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else { @@ -178,22 +185,18 @@ class ParallelFor, : m_functor(arg_functor), m_policy(arg_policy) {} template - static int max_tile_size_product(const Policy& pol, const Functor&) { + static int max_tile_size_product(const Policy&, const Functor&) { using closure_type = ParallelFor, Kokkos::Experimental::HIP>; - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - auto const& prop = pol.space().hip_device_prop(); - // Limits due to registers/SM, MDRange doesn't have - // shared memory constraints - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast( - 
Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; } }; @@ -242,6 +245,9 @@ class ParallelReduce, ReducerType, const bool m_result_ptr_device_accessible; size_type* m_scratch_space; size_type* m_scratch_flags; + // Only let one Parallel/Scan modify the shared memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_shared_memory_lock; using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, Policy, FunctorType, WorkTag, reference_type>; @@ -307,32 +313,30 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: // This is copy/paste from Kokkos_HIP_Parallel_Range inline unsigned local_block_size(const FunctorType& f) { - unsigned int n = - ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; - int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(f, n); - using closure_type = Impl::ParallelReduce; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || - (n > - static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, f, 1, - shmem_size, 0)))) { - n >>= 1; - shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(f, n); + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + using closure_type = ParallelReduce; 
+ + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid tile size.")); } - return n; + return block_size; } inline void execute() { - const int nwork = m_policy.m_num_tiles; + using ClosureType = ParallelReduce; + const int nwork = m_policy.m_num_tiles; if (nwork) { int block_size = m_policy.m_prod_tile_dims; // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions @@ -366,14 +370,16 @@ class ParallelReduce, ReducerType, ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(m_functor, block.y); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce: fence because " + "reduction can't access result storage location"); if (m_result_ptr) { const int size = ValueTraits::value_size( @@ -403,7 +409,10 @@ class ParallelReduce, ReducerType, MemorySpaceAccess::accessible), m_scratch_space(nullptr), - m_scratch_flags(nullptr) {} + m_scratch_flags(nullptr), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -416,23 +425,25 @@ class ParallelReduce, ReducerType, typename ReducerType::result_view_type:: memory_space>::accessible), m_scratch_space(nullptr), - m_scratch_flags(nullptr) {} + m_scratch_flags(nullptr), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} + template - static int max_tile_size_product(const Policy& pol, const Functor&) { + 
static int max_tile_size_product(const Policy&, const Functor&) { using closure_type = ParallelReduce, ReducerType, Kokkos::Experimental::HIP>; - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - auto const& prop = pol.space().hip_device_prop(); - // Limits due do registers/SM - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast( - Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_max_blocksize(); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid tile size.")); + } + return block_size; } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 7d2825eeb4..e02ead1e99 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -108,16 +108,21 @@ class ParallelFor, inline void execute() const { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + using DriverType = + ParallelFor; const int block_size = - LaunchBounds::maxTperB - ? 
LaunchBounds::maxTperB - : ::Kokkos::Experimental::Impl::HIPTraits:: - MaxThreadsPerBlock; // FIXME_HIP Choose block_size better + Kokkos::Experimental::Impl::hip_get_preferred_blocksize(); const dim3 block(1, block_size, 1); const dim3 grid( typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - Kokkos::Experimental::Impl::HIPParallelLaunch( + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } @@ -173,15 +178,12 @@ class ParallelReduce, ReducerType, const bool m_result_ptr_host_accessible; size_type* m_scratch_space = nullptr; size_type* m_scratch_flags = nullptr; + // Only let one ParallelReduce/Scan modify the shared memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_shared_memory_lock; -#if HIP_VERSION < 401 - static bool constexpr UseShflReduction = - ((sizeof(value_type) > 2 * sizeof(double)) && - static_cast(ValueTraits::StaticValueSize)); -#else static bool constexpr UseShflReduction = static_cast(ValueTraits::StaticValueSize); -#endif private: struct ShflReductionTag {}; @@ -328,30 +330,15 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { - unsigned int n = - ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; - int shmem_size = - hip_single_inter_block_reduce_scan_shmem( - f, n); - using closure_type = Impl::ParallelReduce; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || - (n > - static_cast( - 
::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, f, 1, - shmem_size, 0)))) { - n >>= 1; - shmem_size = - hip_single_inter_block_reduce_scan_shmem( - f, n); - } - return n; + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + using DriverType = ParallelReduce; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, LaunchBounds>(instance, shmem_functor); } inline void execute() { @@ -362,7 +349,11 @@ class ParallelReduce, ReducerType, !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_size = local_block_size(m_functor); - KOKKOS_ASSERT(block_size > 0); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } m_scratch_space = ::Kokkos::Experimental::Impl::hip_internal_scratch_space( @@ -391,14 +382,17 @@ class ParallelReduce, ReducerType, WorkTag>(m_functor, block.y); - Kokkos::Experimental::Impl::HIPParallelLaunch( + using DriverType = ParallelReduce; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); + m_policy.space().impl_internal_space_instance()->fence( + "Kokkos::Impl::ParallelReduce: fence because " + "reduction can't access result storage location"); if (m_result_ptr) { const int size = ValueTraits::value_size( @@ -429,7 +423,10 @@ class ParallelReduce, ReducerType, typename ViewType::memory_space>::accessible), m_result_ptr_host_accessible( MemorySpaceAccess::accessible) {} + typename ViewType::memory_space>::accessible), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() 
+ ->m_mutexSharedMemory) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -444,7 +441,10 @@ class ParallelReduce, ReducerType, m_result_ptr_host_accessible( MemorySpaceAccess::accessible) {} + memory_space>::accessible), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} }; template @@ -482,6 +482,9 @@ class ParallelScanHIPBase { size_type* m_scratch_flags = nullptr; size_type m_final = false; int m_grid_x = 0; + // Only let one ParallelReduce/Scan modify the shared memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_shared_memory_lock; private: template @@ -624,22 +627,7 @@ class ParallelScanHIPBase { } // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or - // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y - // - // TODO check best option - - unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4; - while (n && static_cast(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < - hip_single_inter_block_reduce_scan_shmem(f, n)) { - n >>= 1; - } - return n; - } + virtual inline unsigned local_block_size(const FunctorType& f) = 0; inline void impl_execute() { const index_type nwork = m_policy.end() - m_policy.begin(); @@ -649,7 +637,11 @@ class ParallelScanHIPBase { const int gridMaxComputeCapability_2x = 0x01fff; const int block_size = static_cast(local_block_size(m_functor)); - KOKKOS_ASSERT(block_size > 0); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelScan< HIP > could not find a " + "valid execution configuration.")); + } const int grid_max = std::min(block_size * block_size, gridMaxComputeCapability_2x); @@ -674,15 +666,16 @@ class ParallelScanHIPBase { const int shmem = 
ValueTraits::value_size(m_functor) * (block_size + 2); m_final = false; - Kokkos::Experimental::Impl::HIPParallelLaunch( + // these ones are OK to be just the base because the specializations + // do not modify the kernel at all + using DriverType = ParallelScanHIPBase; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute m_final = true; - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute @@ -690,13 +683,17 @@ class ParallelScanHIPBase { } ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} + : m_functor(arg_functor), + m_policy(arg_policy), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} }; template class ParallelScan, Kokkos::Experimental::HIP> - : private ParallelScanHIPBase { + : public ParallelScanHIPBase { public: using Base = ParallelScanHIPBase; using Base::operator(); @@ -706,6 +703,23 @@ class ParallelScan, ParallelScan(const FunctorType& arg_functor, const typename Base::Policy& arg_policy) : Base(arg_functor, arg_policy) {} + + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or + // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y + + const auto& instance = + Base::m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem( + f, n); + }; + using DriverType = ParallelScan; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, typename Base::LaunchBounds>(instance, shmem_functor); + } }; //---------------------------------------------------------------------------- @@ 
-713,7 +727,7 @@ class ParallelScan, template class ParallelScanWithTotal, ReturnType, Kokkos::Experimental::HIP> - : private ParallelScanHIPBase { + : public ParallelScanHIPBase { public: using Base = ParallelScanHIPBase; using Base::operator(); @@ -737,6 +751,24 @@ class ParallelScanWithTotal, const typename Base::Policy& arg_policy, ReturnType& arg_returnvalue) : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} + + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or + // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y + + const auto& instance = + Base::m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem( + f, n); + }; + using DriverType = + ParallelScanWithTotal; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, typename Base::LaunchBounds>(instance, shmem_functor); + } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 96c3ff2a75..b794f5bc03 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -56,20 +56,20 @@ namespace Kokkos { namespace Impl { + template class TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; - using traits = PolicyTraits; + using traits = PolicyTraits; + using BlockType = Kokkos::Experimental::Impl::BlockType; template friend class TeamPolicyInternal; private: - static int constexpr MAX_WARP = 8; - typename traits::execution_space m_space; int m_league_size; int m_team_size; @@ -101,17 +101,9 @@ class TeamPolicyInternal template int team_size_max(FunctorType const& f, ParallelForTag const&) const { using closure_type = - Impl::ParallelFor >; - hipFuncAttributes attr = 
::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< - FunctorType, typename traits::launch_bounds>( - space().impl_internal_space_instance(), attr, f, - static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / impl_vector_length(); + Impl::ParallelFor>; + + return internal_team_size_common(f); } template @@ -129,8 +121,8 @@ class TeamPolicyInternal return internal_team_size_max(f); } - template - inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, + template + inline int team_size_max(const FunctorType& f, const ReducerType&, const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, @@ -141,17 +133,9 @@ class TeamPolicyInternal template int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { using closure_type = - Impl::ParallelFor >; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, typename traits::launch_bounds>( - space().impl_internal_space_instance(), attr, f, - static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / impl_vector_length(); + Impl::ParallelFor>; + + return internal_team_size_common(f); } template @@ -169,7 +153,7 @@ class TeamPolicyInternal return internal_team_size_recommended(f); } - template + template int team_size_recommended(FunctorType const& f, ReducerType const&, ParallelReduceTag const&) const { using closure_type = @@ -177,6 +161,7 @@ class TeamPolicyInternal ReducerType>; return 
internal_team_size_recommended(f); } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } inline bool impl_auto_team_size() const { return m_tune_team_size; } static int vector_length_max() { @@ -211,7 +196,10 @@ class TeamPolicyInternal inline void impl_set_vector_length(size_t size) { m_vector_length = size; } inline void impl_set_team_size(size_t size) { m_team_size = size; } int impl_vector_length() const { return m_vector_length; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } +#endif int team_size() const { return m_team_size; } @@ -266,7 +254,8 @@ class TeamPolicyInternal "space."); // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { + if (m_team_size * m_vector_length > + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " "Team size x vector length must be smaller than 1024.")); @@ -363,26 +352,84 @@ class TeamPolicyInternal using member_type = Kokkos::Impl::HIPTeamMember; protected: - template - int internal_team_size_common(const FunctorType& f, - BlockSizeCallable&& block_size_callable) const { - using closure_type = ClosureType; + template + int internal_team_size_common(const FunctorType& f) const { + // FIXME_HIP: this could be unified with the + // internal_team_size_common_reduce + // once we can turn c++17 constexpr on by default. 
+ // The problem right now is that we can't turn off the evaluation + // of the functor_value_traits's valuesize / StaticValueSize + + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } else { + block_size = + ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "team size.")); + } + return block_size / impl_vector_length(); + } + + template + int internal_team_size_common_reduce(const FunctorType& f) const { using functor_value_traits = Impl::FunctorValueTraits; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - const int block_size = std::forward(block_size_callable)( - space().impl_internal_space_instance(), attr, f, - static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double) + - ((functor_value_traits::StaticValueSize != 0) - ? 
0 - : functor_value_traits::value_size(f))); - KOKKOS_ASSERT(block_size > 0); + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) + + ((functor_value_traits::StaticValueSize != 0) + ? 0 + : functor_value_traits::value_size(f)); + const int vector_length = impl_vector_length(); + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } else { + block_size = + ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid team size.")); + } // Currently we require Power-of-2 team size for reductions. 
int p2 = 1; while (p2 <= block_size) p2 *= 2; @@ -392,16 +439,13 @@ class TeamPolicyInternal template int internal_team_size_max(const FunctorType& f) const { - return internal_team_size_common( - f, ::Kokkos::Experimental::Impl::hip_get_max_block_size< - FunctorType, typename traits::launch_bounds>); + return internal_team_size_common_reduce(f); } template int internal_team_size_recommended(const FunctorType& f) const { - return internal_team_size_common( - f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, typename traits::launch_bounds>); + return internal_team_size_common_reduce( + f); } }; @@ -505,7 +549,11 @@ class ParallelFor, dim3 const block(static_cast(m_vector_size), static_cast(m_team_size), 1); - ::Kokkos::Experimental::Impl::HIPParallelLaunch( + using closure_type = + ParallelFor, + Kokkos::Experimental::HIP>; + ::Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute @@ -520,17 +568,9 @@ class ParallelFor, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelFor, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, launch_bounds>( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -556,23 +596,12 @@ class ParallelFor, int const shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size_total) { - printf( - "%i %i\n", - m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock, - shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } - if (static_cast(m_team_size) > - static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); } @@ -839,8 +868,11 @@ class ParallelReduce, } const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - Kokkos::Experimental::Impl::HIPParallelLaunch( + using closure_type = + ParallelReduce, + ReducerType, Kokkos::Experimental::HIP>; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute @@ -890,17 +922,9 @@ class ParallelReduce, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelReduce, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? 
m_team_size - : Kokkos::Experimental::Impl::hip_get_opt_block_size( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelReduceTag()); m_team_begin = UseShflReduction @@ -958,8 +982,9 @@ class ParallelReduce, "L0 scratch memory")); } - if (static_cast(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + size_t max_size = + arg_policy.team_size_max(arg_functor, ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " "large team size.")); @@ -992,18 +1017,10 @@ class ParallelReduce, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelReduce, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Experimental::Impl::hip_get_opt_block_size( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; - + m_team_size = m_team_size >= 0 + ? m_team_size + : arg_policy.team_size_recommended(arg_functor, reducer, + ParallelReduceTag()); m_team_begin = UseShflReduction ? 0 @@ -1046,7 +1063,6 @@ class ParallelReduce, // upon team size. 
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && !UseShflReduction) || m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -1054,8 +1070,10 @@ class ParallelReduce, Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); } - if (static_cast(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + + size_t max_size = + arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " "large team size.")); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 15ca089d14..e25ebe2ab3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -67,102 +67,32 @@ namespace { hipStream_t get_deep_copy_stream() { static hipStream_t s = nullptr; if (s == nullptr) { - HIP_SAFE_CALL(hipStreamCreate(&s)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s)); } return s; } } // namespace -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +void DeepCopyHIP(void* dst, void const* src, size_t n) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); } -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy:: - DeepCopy(const 
Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy:: - DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy:: - DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( +void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst, + void const* src, size_t n) { + KOKKOS_IMPL_HIP_SAFE_CALL( hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); } void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { hipStream_t s = get_deep_copy_stream(); - HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s)); - HIP_SAFE_CALL(hipStreamSynchronize(s)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s)); + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy " + "stream", + 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + DeepCopyResourceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); }); } } // namespace Impl @@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 namespace Kokkos { KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { @@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { } } // namespace Kokkos +#endif /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ @@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, @@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); } } // namespace Experimental @@ -427,23 +359,42 @@ HIP::HIP() "HIP instance constructor"); } -HIP::HIP(hipStream_t const stream) +HIP::HIP(hipStream_t const stream, bool manage_stream) : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { ptr->finalize(); delete ptr; }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream); + m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream, + manage_stream); } void HIP::print_configuration(std::ostream& s, const 
bool) { Impl::HIPInternal::singleton().print_configuration(s); } -void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); } +uint32_t HIP::impl_instance_id() const noexcept { + return m_space_instance->impl_get_instance_id(); +} +void HIP::impl_static_fence(const std::string& name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); }); +} +void HIP::impl_static_fence() { + impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence"); +} -void HIP::fence() const { m_space_instance->fence(); } +void HIP::fence(const std::string& name) const { + m_space_instance->fence(name); +} +void HIP::fence() const { + fence("Kokkos::HIP::fence(): Unnamed Instance Fence"); +} hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; } @@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) { void HIPSpaceInitializer::fence() { Kokkos::Experimental::HIP::impl_static_fence(); } +void HIPSpaceInitializer::fence(const std::string& name) { + Kokkos::Experimental::HIP::impl_static_fence(name); +} void HIPSpaceInitializer::print_configuration(std::ostream& msg, const bool detail) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fe52886ced..fb67a25c5e 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -316,198 +316,6 @@ class HIPTeamMember { #endif } - //-------------------------------------------------------------------------- - /**\brief Global reduction across all blocks - * - * Return !0 if reducer contains the final value - */ - template - KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, int>::type - global_reduce(ReducerType const& reducer, int* const global_scratch_flags, - void* const 
global_scratch_space, void* const shmem, - int const shmem_size) { -#ifdef __HIP_DEVICE_COMPILE__ - using value_type = typename ReducerType::value_type; - using pointer_type = value_type volatile*; - - // Number of shared memory entries for the reduction: - const int nsh = shmem_size / sizeof(value_type); - - // Number of HIP threads in the block, rank within the block - const int nid = blockDim.x * blockDim.y * blockDim.z; - const int tid = - threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); - - // Reduces within block using all available shared memory - // Contributes if it is the root "vector lane" - - // wn == number of warps in the block - // wx == which lane within the warp - // wy == which warp within the block - - const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >> - Experimental::Impl::HIPTraits::WarpIndexShift; - const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask; - const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift; - - //------------------------ - { // Intra warp shuffle reduction from contributing HIP threads - - value_type tmp(reducer.reference()); - - int constexpr warp_size = - ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; - for (int i = warp_size; static_cast(blockDim.x) <= (i >>= 1);) { - Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i, - warp_size); - - // Root of each vector lane reduces "thread" contribution - if (0 == threadIdx.x && wx < i) { - reducer.join(&tmp, reducer.data()); - } - } - - // Reduce across warps using shared memory. - // Number of warps may not be power of two. - - __syncthreads(); // Wait before shared data write - - // Number of shared memory entries for the reduction - // is at most one per warp - const int nentry = wn < nsh ? 
wn : nsh; - - if (0 == wx && wy < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - (reinterpret_cast(shmem))[wy] = tmp; - } - - __syncthreads(); // Wait for write to be visible to block - - // When more warps than shared entries - // then warps must take turns joining their contribution - // to the designated shared memory entry. - for (int i = nentry; i < wn; i += nentry) { - const int k = wy - i; - - if (0 == wx && i <= wy && k < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - reducer.join((reinterpret_cast(shmem)) + k, &tmp); - } - - __syncthreads(); // Wait for write to be visible to block - } - - // One warp performs the inter-warp reduction: - - if (0 == wy) { - // Start fan-in at power of two covering nentry - - for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nentry) { - reducer.join((reinterpret_cast(shmem)) + wx, - (reinterpret_cast(shmem)) + k); - __threadfence_block(); // Wait for write to be visible to warp - } - } - } - } - //------------------------ - { // Write block's value to global_scratch_memory - - int last_block = 0; - - if (0 == wx) { - reducer.copy((reinterpret_cast(global_scratch_space)) + - blockIdx.x * reducer.length(), - reducer.data()); - - __threadfence(); // Wait until global write is visible. - - last_block = static_cast(gridDim.x) == - 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); - - // If last block then reset count - if (last_block) *global_scratch_flags = 0; - } - - // FIXME hip does not support __syncthreads_or so we need to do it by hand - // last_block = __syncthreads_or(last_block); - - __shared__ int last_block_shared; - if (last_block) last_block_shared = last_block; - __threadfence_block(); - - if (!last_block_shared) return 0; - } - //------------------------ - // Last block reads global_scratch_memory into shared memory. - - const int nentry = nid < gridDim.x ? (nid < nsh ? 
nid : nsh) - : (gridDim.x < nsh ? gridDim.x : nsh); - - // nentry = min( nid , nsh , gridDim.x ) - - // whole block reads global memory into shared memory: - - if (tid < nentry) { - const int offset = tid * reducer.length(); - - reducer.copy( - (reinterpret_cast(shmem)) + offset, - (reinterpret_cast(global_scratch_space)) + offset); - - for (int i = nentry + tid; i < static_cast(gridDim.x); i += nentry) { - reducer.join((reinterpret_cast(shmem)) + offset, - (reinterpret_cast(global_scratch_space)) + - i * reducer.length()); - } - } - - __syncthreads(); // Wait for writes to be visible to block - - if (0 == wy) { - // Iterate to reduce shared memory to single warp fan-in size - - int constexpr warp_size = - ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; - const int nreduce = warp_size < nentry ? warp_size : nentry; - - if (wx < nreduce && nreduce < nentry) { - for (int i = nreduce + wx; i < nentry; i += nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); - } - __threadfence_block(); // Wait for writes to be visible to warp - } - - // Start fan-in at power of two covering nentry - - for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nreduce) { - reducer.join((reinterpret_cast(shmem)) + wx, - (reinterpret_cast(shmem)) + k); - __threadfence_block(); // Wait for writes to be visible to warp - } - } - - if (0 == wx) { - reducer.copy(reducer.data(), reinterpret_cast(shmem)); - return 1; - } - } - return 0; -#else - (void)reducer; - (void)global_scratch_flags; - (void)global_scratch_space; - (void)shmem; - (void)shmem_size; - return 0; -#endif - } - //---------------------------------------- // Private for the driver diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index 910d5e52e6..d9cb66e11f 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -191,6 +191,9 @@ void 
HPXSpaceInitializer::finalize(const bool all_spaces) { } void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); } +void HPXSpaceInitializer::fence(const std::string &name) { + Kokkos::Experimental::HPX().fence(name); +} void HPXSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index df09e026fd..7bb3ca5d00 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -82,7 +82,9 @@ class TaskQueueSpecialization< task_queue.scheduler = &scheduler; Kokkos::Impl::dispatch_execute_task(&task_queue, Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence(); + Kokkos::Experimental::HPX().fence( + "Kokkos::Impl::TaskQueueSpecialization::execute: fence " + "after task execution"); } // Must provide task queue execution function @@ -214,7 +216,7 @@ class TaskQueueSpecializationConstrained< task_queue.scheduler = &scheduler; Kokkos::Impl::dispatch_execute_task(&task_queue, Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence(); + Kokkos::Experimental::HPX().fence("Kokkos::Impl::TaskQueueSpecializationConstrained::execute: fence after task execution"); } // Must provide task queue execution function diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp index 527fe12ad9..d7e13e28f0 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -79,7 +79,9 @@ class ParallelFor, public: void execute() const { dispatch_execute_task(this, m_policy.space()); - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Experimental::Impl::HPX::ParallelFor: fence " + "after kernel execution"); } void execute_task() const { diff --git a/lib/kokkos/core/src/KokkosExp_InterOp.hpp b/lib/kokkos/core/src/KokkosExp_InterOp.hpp new
file mode 100644 index 0000000000..37c2088f88 --- /dev/null +++ b/lib/kokkos/core/src/KokkosExp_InterOp.hpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_EXP_INTEROP_HPP +#define KOKKOS_CORE_EXP_INTEROP_HPP + +#include +#include +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ------------------------------------------------------------------ // +// this is used to convert +// Kokkos::Device to MemSpace +// +template +struct device_memory_space { + using type = Tp; +}; + +template +struct device_memory_space> { + using type = MemT; +}; + +template +using device_memory_space_t = typename device_memory_space::type; + +// ------------------------------------------------------------------ // +// this is the impl version which takes a view and converts to python +// view type +// +template +struct python_view_type_impl; + +template