diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 2d259791f2..f67699c54d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -156,8 +156,7 @@ if(BUILD_MPI) endif() endif() else() - enable_language(C) - file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c) + file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.cpp) add_library(mpi_stubs STATIC ${MPI_SOURCES}) set_target_properties(mpi_stubs PROPERTIES OUTPUT_NAME lammps_mpi_stubs${LAMMPS_MACHINE}) target_include_directories(mpi_stubs PUBLIC $) @@ -778,9 +777,7 @@ if(PKG_GPU) message(STATUS "<<< GPU package settings >>> -- GPU API: ${GPU_API}") if(GPU_API STREQUAL "CUDA") - message(STATUS "GPU architecture: ${GPU_ARCH}") - elseif(GPU_API STREQUAL "OPENCL") - message(STATUS "OpenCL tuning: ${OCL_TUNE}") + message(STATUS "GPU default architecture: ${GPU_ARCH}") elseif(GPU_API STREQUAL "HIP") message(STATUS "HIP platform: ${HIP_PLATFORM}") message(STATUS "HIP architecture: ${HIP_ARCH}") diff --git a/cmake/Modules/Documentation.cmake b/cmake/Modules/Documentation.cmake index 189c32e301..5a42244b9e 100644 --- a/cmake/Modules/Documentation.cmake +++ b/cmake/Modules/Documentation.cmake @@ -50,9 +50,9 @@ if(BUILD_DOC) OUTPUT ${DOC_BUILD_DIR}/requirements.txt DEPENDS docenv ${DOCENV_REQUIREMENTS_FILE} COMMAND ${CMAKE_COMMAND} -E copy ${DOCENV_REQUIREMENTS_FILE} ${DOC_BUILD_DIR}/requirements.txt - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade pip - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade ${LAMMPS_DOC_DIR}/utils/converters - COMMAND ${DOCENV_BINARY_DIR}/pip install --use-feature=2020-resolver -r ${DOC_BUILD_DIR}/requirements.txt --upgrade + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade pip + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade ${LAMMPS_DOC_DIR}/utils/converters + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install -r ${DOC_BUILD_DIR}/requirements.txt --upgrade ) # download mathjax distribution and unpack to folder "mathjax" diff --git a/cmake/Modules/GTest.cmake b/cmake/Modules/GTest.cmake index 060a7e42f9..0c62291d5e 100644 --- a/cmake/Modules/GTest.cmake +++ b/cmake/Modules/GTest.cmake @@ -20,10 +20,10 @@ ExternalProject_Add(googletest -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a + BUILD_BYPRODUCTS /lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} LOG_DOWNLOAD ON LOG_CONFIGURE ON LOG_BUILD ON @@ -39,10 +39,10 @@ file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIR}) file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIR}) ExternalProject_Get_Property(googletest BINARY_DIR) -set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a) -set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a) -set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a) -set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a) 
+set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) # Prevent GoogleTest from overriding our compiler/linker options # when building with Visual Studio diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake new file mode 100644 index 0000000000..ecd9204d24 --- /dev/null +++ b/cmake/Modules/OpenCLLoader.cmake @@ -0,0 +1,54 @@ +message(STATUS "Downloading and building OpenCL loader library") + +if(CMAKE_BUILD_TYPE STREQUAL Debug) + set(OPENCL_LOADER_LIB_POSTFIX d) +else() + set(OPENCL_LOADER_LIB_POSTFIX) +endif() + +include(ExternalProject) +set(OPENCL_LOADER_URL "https://download.lammps.org/thirdparty/opencl-loader-2020.12.18.tar.gz" CACHE STRING "URL for OpenCL loader tarball") +mark_as_advanced(OPENCL_LOADER_URL) +ExternalProject_Add(opencl_loader + URL ${OPENCL_LOADER_URL} + URL_MD5 011cdcbd41030be94f3fced6d763a52a + SOURCE_DIR "${CMAKE_BINARY_DIR}/opencl_loader-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/opencl_loader-build" + CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_OPENCL_LOADER_OPTS} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + BUILD_BYPRODUCTS /libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + LOG_DOWNLOAD ON + LOG_CONFIGURE ON + LOG_BUILD ON + INSTALL_COMMAND "" + TEST_COMMAND "") + +ExternalProject_Get_Property(opencl_loader SOURCE_DIR) +set(OPENCL_LOADER_INCLUDE_DIR ${SOURCE_DIR}/inc) + +# workaround for CMake 3.10 on ubuntu 18.04 +file(MAKE_DIRECTORY ${OPENCL_LOADER_INCLUDE_DIR}) + +ExternalProject_Get_Property(opencl_loader BINARY_DIR) +set(OPENCL_LOADER_LIBRARY_PATH "${BINARY_DIR}/libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") + +find_package(Threads QUIET) +if(NOT WIN32) + set(OPENCL_LOADER_DEP_LIBS "Threads::Threads;${CMAKE_DL_LIBS}") +else() + set(OPENCL_LOADER_DEP_LIBS "cfgmgr32;runtimeobject") +endif() + +add_library(OpenCL::OpenCL UNKNOWN IMPORTED) +add_dependencies(OpenCL::OpenCL opencl_loader) + +set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION ${OPENCL_LOADER_LIBRARY_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${OPENCL_LOADER_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES "${OPENCL_LOADER_DEP_LIBS}") + + diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 4c52eee68b..e2586881ef 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -1,7 +1,9 @@ set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU) set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h - ${GPU_SOURCES_DIR}/fix_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_gpu.cpp + ${GPU_SOURCES_DIR}/fix_nh_gpu.h + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") @@ -139,27 +141,13 @@ if(GPU_API STREQUAL "CUDA") target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS}) elseif(GPU_API STREQUAL "OPENCL") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - # download and unpack support binaries for compilation of windows binaries. 
- set(LAMMPS_THIRDPARTY_URL "https://download.lammps.org/thirdparty") - file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz" - EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d) - execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_library(OpenCL::OpenCL UNKNOWN IMPORTED) - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll") - endif() - set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include") + option(USE_STATIC_OPENCL_LOADER "Download and include a static OpenCL ICD loader" ON) + mark_as_advanced(USE_STATIC_OPENCL_LOADER) + if (USE_STATIC_OPENCL_LOADER) + include(OpenCLLoader) else() find_package(OpenCL REQUIRED) endif() - set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning") - set(OCL_TUNE_VALUES intel fermi kepler cypress generic) - set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES}) - validate_option(OCL_TUNE OCL_TUNE_VALUES) - string(TOUPPER ${OCL_TUNE} OCL_TUNE) include(OpenCLUtils) set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h) @@ -203,7 +191,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) target_link_libraries(lammps PRIVATE gpu) @@ -211,6 +199,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL) target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL) + add_dependencies(ocl_get_devices OpenCL::OpenCL) elseif(GPU_API STREQUAL "HIP") if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) @@ -393,13 +382,10 @@ elseif(GPU_API STREQUAL "HIP") target_link_libraries(lammps PRIVATE gpu) endif() -# GPU package -FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX) - set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}") - -# detects styles which have GPU version +# detect styles which have a GPU version RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES) +RegisterFixStyle(${GPU_SOURCES_DIR}/fix_gpu.h) get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES) diff --git a/cmake/Modules/Packages/KIM.cmake b/cmake/Modules/Packages/KIM.cmake index 83a96d02b8..5482d3071c 100644 --- a/cmake/Modules/Packages/KIM.cmake +++ b/cmake/Modules/Packages/KIM.cmake @@ -69,14 +69,14 @@ if(DOWNLOAD_KIM) BUILD_RPATH "${_rpath_prefix}/kim_build-prefix/lib" ) else() - if(KIM-API_FOUND AND KIM_API_VERSION VERSION_GREATER_EQUAL 2.2.0) + if(KIM-API_FOUND AND KIM-API_VERSION VERSION_GREATER_EQUAL 2.2.0) # For kim-api >= 2.2.0 - find_package(KIM-API ${KIM-API_MIN_VERSION} CONFIG REQUIRED) + find_package(KIM-API 2.2.0 CONFIG REQUIRED) 
target_link_libraries(lammps PRIVATE KIM-API::kim-api) else() # For kim-api 2.1.3 (consistent with previous version of this file) find_package(PkgConfig REQUIRED) - pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=KIM-API_MIN_VERSION) + pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=${KIM-API_MIN_VERSION}) target_link_libraries(lammps PRIVATE PkgConfig::KIM-API) endif() endif() diff --git a/cmake/Modules/Packages/MESSAGE.cmake b/cmake/Modules/Packages/MESSAGE.cmake index fb62763828..6ff4e322aa 100644 --- a/cmake/Modules/Packages/MESSAGE.cmake +++ b/cmake/Modules/Packages/MESSAGE.cmake @@ -2,9 +2,8 @@ if(LAMMPS_SIZES STREQUAL BIGBIG) message(FATAL_ERROR "The MESSAGE Package is not compatible with -DLAMMPS_BIGBIG") endif() option(MESSAGE_ZMQ "Use ZeroMQ in MESSAGE package" OFF) -file(GLOB_RECURSE cslib_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.F - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.c - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) +file(GLOB_RECURSE cslib_SOURCES + ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) add_library(cslib STATIC ${cslib_SOURCES}) target_compile_definitions(cslib PRIVATE -DLAMMPS_${LAMMPS_SIZES}) diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake index a080b566be..f2ba34e1b6 100644 --- a/cmake/Modules/YAML.cmake +++ b/cmake/Modules/YAML.cmake @@ -12,7 +12,7 @@ ExternalProject_Add(libyaml CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} --prefix= --disable-shared - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a + BUILD_BYPRODUCTS /lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX} TEST_COMMAND "") ExternalProject_Get_Property(libyaml INSTALL_DIR) @@ -23,7 +23,7 @@ set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib) file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR}) file(MAKE_DIRECTORY ${YAML_LIBRARY_DIR}) -set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a) +set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX}) add_library(Yaml::Yaml UNKNOWN IMPORTED) set_target_properties(Yaml::Yaml PROPERTIES diff --git a/doc/Makefile b/doc/Makefile index 6032aff45f..7deaaf2a2e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -47,6 +47,8 @@ HAS_PDFLATEX = YES endif endif +# override settings for PIP commands +# PIP_OPTIONS = --cert /etc/pki/ca-trust/extracted/openssl/ca-bundle.trust.crt --proxy http://proxy.mydomain.org #SPHINXEXTRA = -j $(shell $(PYTHON) -c 'import multiprocessing;print(multiprocessing.cpu_count())') $(shell test -f $(BUILDDIR)/doxygen/xml/run.stamp && printf -- "-E") @@ -228,13 +230,13 @@ $(VENV): @( \ $(VIRTUALENV) -p $(PYTHON) $(VENV); \ . $(VENV)/bin/activate; \ - pip install --upgrade pip; \ - pip install -r $(BUILDDIR)/utils/requirements.txt; \ + pip $(PIP_OPTIONS) install --upgrade pip; \ + pip $(PIP_OPTIONS) install -r $(BUILDDIR)/utils/requirements.txt; \ deactivate;\ ) $(MATHJAX): - @git clone --depth 1 https://github.com/mathjax/MathJax.git $@ + @git clone --depth 1 git://github.com/mathjax/MathJax.git $@ $(TXT2RST) $(ANCHORCHECK): $(VENV) @( \ diff --git a/doc/src/Build_basics.rst b/doc/src/Build_basics.rst index cb6bd9f6aa..c7baa21e62 100644 --- a/doc/src/Build_basics.rst +++ b/doc/src/Build_basics.rst @@ -95,7 +95,7 @@ standard. A more detailed discussion of that is below. .. note:: - The file ``src/STUBS/mpi.c`` provides a CPU timer function + The file ``src/STUBS/mpi.cpp`` provides a CPU timer function called ``MPI_Wtime()`` that calls ``gettimeofday()``. 
If your operating system does not support ``gettimeofday()``, you will need to insert code to call another timer. Note that the
diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst
index 60d5ad09af..9180933007 100644
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@@ -120,8 +120,6 @@ CMake build
 -D GPU_API=value # value = opencl (default) or cuda or hip
 -D GPU_PREC=value # precision setting # value = double or mixed (default) or single
- -D OCL_TUNE=value # hardware choice for GPU_API=opencl
- # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
 -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda # value = sm_XX, see below # default is sm_50
@@ -135,6 +133,8 @@ CMake build
 # value = yes (default) or no
 -D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon # value = yes or no (default)
+ -D USE_STATIC_OPENCL_LOADER=value # downloads/includes OpenCL ICD loader library, no local OpenCL headers/libs needed
+ # value = yes (default) or no
 :code:`GPU_ARCH` settings for different GPU hardware are as follows:
@@ -161,6 +161,12 @@ When building with CMake, you **must NOT** build the GPU library in ``lib/gpu``
 using the traditional build procedure. CMake will detect files generated by that process and will terminate with an error and a suggestion for how to remove them.
+If you are compiling for OpenCL, the default setting is to download, build, and
+link with a static OpenCL ICD loader library and standard OpenCL headers. This
+way, no local OpenCL development headers or libraries need to be present; only
+OpenCL compatible drivers need to be installed to use OpenCL. If this is not
+desired, you can set :code:`USE_STATIC_OPENCL_LOADER` to :code:`no`.
+
 If you are compiling with HIP, note that before running CMake you will have to set appropriate environment variables. Some variables such as :code:`HCC_AMDGPU_TARGET` or :code:`CUDA_PATH` are necessary for :code:`hipcc`
diff --git a/doc/src/Build_link.rst b/doc/src/Build_link.rst
index 3d66371304..5255620231 100644
--- a/doc/src/Build_link.rst
+++ b/doc/src/Build_link.rst
@@ -20,16 +20,8 @@ the suffix ``.so.0`` (or some other number).
 .. note::
 Care should be taken to use the same MPI library for the calling code
- and the LAMMPS library. The ``library.h`` file includes ``mpi.h``
- and uses definitions from it so those need to be available and
- consistent. When LAMMPS is compiled with the included STUBS MPI
- library, then its ``mpi.h`` file needs to be included. While it is
- technically possible to use a full MPI library in the calling code
- and link to a serial LAMMPS library compiled with MPI STUBS, it is
- recommended to use the *same* MPI library for both, and then use
- ``MPI_Comm_split()`` in the calling code to pass a suitable
- communicator with a subset of MPI ranks to the function creating the
- LAMMPS instance.
+ and the LAMMPS library unless LAMMPS is to be compiled without (real)
+ MPI support using the included STUBS MPI library.
 Link with LAMMPS as a static library
 ------------------------------------
@@ -110,7 +102,7 @@ executable, that are also required to link the LAMMPS executable.
 .. code-block:: bash
- gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src caller.c
+ gcc -c -O -I${HOME}/lammps/src caller.c
 g++ -o caller caller.o -L${HOME}/lammps/lib/poems \
 -L${HOME}/lammps/src/STUBS -L${HOME}/lammps/src \
 -llammps_serial -lpoems -lmpi_stubs
@@ -174,7 +166,7 @@ the POEMS package installed becomes:
.. code-block:: bash
- gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src caller.c
+ gcc -c -O -I${HOME}/lammps/src caller.c
 g++ -o caller caller.o -L${HOME}/lammps/src -llammps_serial
 Locating liblammps.so at runtime
diff --git a/doc/src/Build_manual.rst b/doc/src/Build_manual.rst
index 59e4e3235b..3bf0337b31 100644
--- a/doc/src/Build_manual.rst
+++ b/doc/src/Build_manual.rst
@@ -74,7 +74,11 @@ For the documentation build a python virtual environment is set up in the
 folder ``doc/docenv`` and various python packages are installed into that virtual environment via the ``pip`` tool. For rendering embedded LaTeX code also the `MathJax `_ JavaScript
-engine needs to be downloaded.
+engine needs to be downloaded. If you need to pass additional options
+to the pip commands (e.g. to use a web proxy or to point to
+additional SSL certificates), you can set them via the ``PIP_OPTIONS``
+environment variable or uncomment and edit the ``PIP_OPTIONS`` setting
+at the beginning of the makefile.
 The actual translation is then done via ``make`` commands in the doc folder. The following ``make`` commands are available:
@@ -108,7 +112,10 @@ installation of the HTML manual pages into the "install" step when
 installing LAMMPS after the CMake build via ``cmake --build . --target install``. The documentation build is included in the default build target, but can also be requested independently with
-``cmake --build . --target doc``.
+``cmake --build . --target doc``. If you need to pass additional options
+to the pip commands (e.g. to use a web proxy or to point to
+additional SSL certificates), you can set them via the ``PIP_OPTIONS``
+environment variable.
 .. code-block:: bash
diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst
index 26dcc1101c..4793568288 100644
--- a/doc/src/Commands_fix.rst
+++ b/doc/src/Commands_fix.rst
@@ -114,7 +114,7 @@ OPT.
 * :doc:`nph/eff `
 * :doc:`nph/sphere (o) `
 * :doc:`nphug `
- * :doc:`npt (iko) `
+ * :doc:`npt (giko) `
 * :doc:`npt/asphere (o) `
 * :doc:`npt/body `
 * :doc:`npt/cauchy `
@@ -122,8 +122,8 @@ OPT.
 * :doc:`npt/sphere (o) `
 * :doc:`npt/uef `
 * :doc:`numdiff `
- * :doc:`nve (iko) `
- * :doc:`nve/asphere (i) `
+ * :doc:`nve (giko) `
+ * :doc:`nve/asphere (gi) `
 * :doc:`nve/asphere/noforce `
 * :doc:`nve/awpmd `
 * :doc:`nve/body `
@@ -138,7 +138,7 @@ OPT.
 * :doc:`nve/spin `
 * :doc:`nve/tri `
 * :doc:`nvk `
- * :doc:`nvt (iko) `
+ * :doc:`nvt (giko) `
 * :doc:`nvt/asphere (o) `
 * :doc:`nvt/body `
 * :doc:`nvt/eff `
diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index f5b1ef9b38..e7277e2bbb 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -122,7 +122,7 @@ OPT.
 * :doc:`lebedeva/z `
 * :doc:`lennard/mdf `
 * :doc:`line/lj `
- * :doc:`lj/charmm/coul/charmm (iko) `
+ * :doc:`lj/charmm/coul/charmm (giko) `
 * :doc:`lj/charmm/coul/charmm/implicit (ko) `
 * :doc:`lj/charmm/coul/long (gikot) `
 * :doc:`lj/charmm/coul/long/soft (o) `
diff --git a/doc/src/Install_tarball.rst b/doc/src/Install_tarball.rst
index 7c9e834104..6f87df8a21 100644
--- a/doc/src/Install_tarball.rst
+++ b/doc/src/Install_tarball.rst
@@ -33,22 +33,19 @@ in its name, e.g. lammps-23Jun18.
 ----------
-You can also download a zip file via the "Clone or download" button on
-the `LAMMPS GitHub site `_. The file name will be lammps-master.zip
-which can be unzipped with the following command, to create
-a lammps-master dir:
+You can also download compressed tar or zip archives from the
+"Assets" section of the `LAMMPS GitHub releases site `_.
+The file name will be lammps-.zip which can be unzipped +with the following command, to create a lammps- dir: .. code-block:: bash $ unzip lammps*.zip -This version is the most up-to-date LAMMPS development version. It -will have the date of the most recent patch release (see the file -src/version.h). But it will also include any new bug-fixes or -features added since the last patch release. They will be included in -the next patch release tarball. +This version corresponds to the selected LAMMPS patch or stable +release. -.. _git: https://github.com/lammps/lammps +.. _git: https://github.com/lammps/lammps/releases ---------- diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index 92b9677d16..be0d4ff800 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -50,7 +50,7 @@ against invalid accesses. **Numpy Methods**: - * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array + * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array .. tab:: PyLammps/IPyLammps API diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index 59be645cbd..d2564986de 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -61,7 +61,7 @@ functions. Below is a detailed documentation of the API. .. autoclass:: lammps.lammps :members: -.. autoclass:: lammps.numpy::numpy_wrapper +.. autoclass:: lammps.numpy_wrapper::numpy_wrapper :members: ---------- @@ -134,8 +134,8 @@ Style Constants to request from computes or fixes. See :cpp:enum:`_LMP_STYLE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_type_constants: @@ -149,8 +149,8 @@ Type Constants to request from computes or fixes. See :cpp:enum:`_LMP_TYPE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_vartype_constants: @@ -170,6 +170,6 @@ Classes representing internal objects :members: :no-undoc-members: -.. autoclass:: lammps.numpy::NumPyNeighList +.. 
autoclass:: lammps.numpy_wrapper::NumPyNeighList :members: :no-undoc-members: diff --git a/doc/src/Python_neighbor.rst b/doc/src/Python_neighbor.rst index 80651b608f..cba117ad20 100644 --- a/doc/src/Python_neighbor.rst +++ b/doc/src/Python_neighbor.rst @@ -14,5 +14,5 @@ Neighbor list access **NumPy Methods:** -* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays -* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) +* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays +* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) diff --git a/doc/src/Python_objects.rst b/doc/src/Python_objects.rst index ec29863d38..4c8161b8bd 100644 --- a/doc/src/Python_objects.rst +++ b/doc/src/Python_objects.rst @@ -36,9 +36,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. Python subscripting. The values will be zero for atoms not in the specified group. - :py:meth:`lammps.numpy.extract_compute() `, - :py:meth:`lammps.numpy.extract_fix() `, and - :py:meth:`lammps.numpy.extract_variable() ` are + :py:meth:`lammps.numpy.extract_compute() `, + :py:meth:`lammps.numpy.extract_fix() `, and + :py:meth:`lammps.numpy.extract_variable() ` are equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. The :py:meth:`lammps.set_variable() ` method sets an @@ -54,9 +54,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. **NumPy Methods**: - * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays .. tab:: PyLammps/IPyLammps API diff --git a/doc/src/Speed_gpu.rst b/doc/src/Speed_gpu.rst index 56eb48cd0e..709a3ad3bb 100644 --- a/doc/src/Speed_gpu.rst +++ b/doc/src/Speed_gpu.rst @@ -1,11 +1,14 @@ GPU package =========== -The GPU package was developed by Mike Brown while at SNL and ORNL -and his collaborators, particularly Trung Nguyen (now at Northwestern). -It provides GPU versions of many pair styles and for parts of the -:doc:`kspace_style pppm ` for long-range Coulombics. -It has the following general features: +The GPU package was developed by Mike Brown while at SNL and ORNL (now +at Intel Corp.) and his collaborators, particularly Trung Nguyen (now at +Northwestern). Support for AMD GPUs via HIP was added by Vsevolod Nikolskiy +and coworkers at HSE University. + +The GPU package provides GPU versions of many pair styles and for +parts of the :doc:`kspace_style pppm ` for long-range +Coulombics. It has the following general features: * It is designed to exploit common GPU hardware configurations where one or more GPUs are coupled to many cores of one or more multi-core CPUs, @@ -24,8 +27,9 @@ It has the following general features: force vectors. 
* LAMMPS-specific code is in the GPU package. It makes calls to a
 generic GPU library in the lib/gpu directory. This library provides
- NVIDIA support as well as more general OpenCL support, so that the
- same functionality is supported on a variety of hardware.
+ either Nvidia support, AMD support, or more general OpenCL support
+ (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multi-core CPUs),
+ so that the same functionality is supported on a variety of hardware.
 **Required hardware/software:**
@@ -45,12 +49,23 @@ to have the OpenCL headers and the (vendor neutral) OpenCL library
 installed. In OpenCL mode, the acceleration depends on having an `OpenCL Installable Client Driver (ICD) `_ installed. There can be multiple of them for the same or different hardware (GPUs, CPUs, Accelerators) installed at the same time. OpenCL refers to those
-as 'platforms'. The GPU library will select the **first** suitable platform,
-but this can be overridden using the device option of the :doc:`package `
+as 'platforms'. The GPU library will try to auto-select the best suitable platform,
+but this can be overridden using the platform option of the :doc:`package `
 command. Run lammps/lib/gpu/ocl_get_devices to get a list of available platforms and devices with a suitable ICD available.
-To compute and use this package in HIP mode, you have to have the AMD ROCm
+To compile and use this package for Intel GPUs, OpenCL or the Intel oneAPI
+HPC Toolkit can be installed using Linux package managers. The latter also
+provides optimized C++, MPI, and many other libraries and tools. See:
+
+* https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit/download.html
+
+If you do not have a discrete GPU card installed, this package can still provide
+significant speedups on some CPUs that include integrated GPUs. Additionally, for
+many Macs, OpenCL is already included with the OS and Makefiles are available
+in the lib/gpu directory.
+
+To compile and use this package in HIP mode, you have to have the AMD ROCm
 software installed. Versions of ROCm older than 3.5 are currently deprecated by AMD.
@@ -75,10 +90,20 @@ automatically if you create more MPI tasks/node than there are
 GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be shared by 4 MPI tasks.
+The GPU package also has limited support for OpenMP for both
+multi-threading and vectorization of routines that are run on the CPUs.
+This requires that the GPU library and LAMMPS are built with flags to
+enable OpenMP support (e.g. -fopenmp). Some styles for time integration
+are also available in the GPU package. These run completely on the CPUs
+in full double precision, but exploit multi-threading and vectorization
+for faster performance.
+
 Use the "-sf gpu" :doc:`command-line switch `, which
 will automatically append "gpu" to styles that support it. Use the
 "-pk gpu Ng" :doc:`command-line switch ` to set Ng = # of
-GPUs/node to use.
+GPUs/node to use. If Ng is 0, the number is selected automatically as
+the number of matching GPUs that have the highest number of compute
+cores.
 .. code-block:: bash
@@ -87,8 +112,8 @@ GPUs/node to use.
 mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes
 Note that if the "-sf gpu" switch is used, it also issues a default
-:doc:`package gpu 1 ` command, which sets the number of
-GPUs/node to 1.
+:doc:`package gpu 0 ` command, which will result in
+automatic selection of the number of GPUs to use.
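As a minimal illustration of the auto-selection default described above (a sketch; the binary name ``lmp_machine`` and input ``in.script`` are placeholders):

.. code-block:: bash

   # let the GPU package pick the platform and devices (implied "package gpu 0")
   mpirun -np 8 lmp_machine -sf gpu -in in.script

   # the same run, with the automatic device count made explicit
   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 -in in.script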
Using the "-pk" switch explicitly allows for setting of the number of GPUs/node to use and additional options. Its syntax is the same as @@ -138,6 +163,13 @@ Likewise, you should experiment with the precision setting for the GPU library to see if single or mixed precision will give accurate results, since they will typically be faster. +MPI parallelism typically outperforms OpenMP parallelism, but in some +cases using fewer MPI tasks and multiple OpenMP threads with the GPU +package can give better performance. 3-body potentials can often perform +better with multiple OMP threads because the inter-process communication +is higher for these styles with the GPU package in order to allow +deterministic results. + **Guidelines for best performance:** * Using multiple MPI tasks per GPU will often give the best performance, @@ -161,6 +193,12 @@ results, since they will typically be faster. :doc:`angle `, :doc:`dihedral `, :doc:`improper `, and :doc:`long-range ` calculations will not be included in the "Pair" time. +* Since only part of the pppm kspace style is GPU accelerated, it + may be faster to only use GPU acceleration for Pair styles with + long-range electrostatics. See the "pair/only" keyword of the + package command for a shortcut to do that. The work between kspace + on the CPU and non-bonded interactions on the GPU can be balanced + through adjusting the coulomb cutoff without loss of accuracy. * When the *mode* setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the diff --git a/doc/src/Speed_packages.rst b/doc/src/Speed_packages.rst index 600c4ac2b4..6210242413 100644 --- a/doc/src/Speed_packages.rst +++ b/doc/src/Speed_packages.rst @@ -16,7 +16,7 @@ These are the accelerator packages currently in LAMMPS, either as standard or user packages: +-----------------------------------------+-------------------------------------------------------+ -| :doc:`GPU Package ` | for NVIDIA GPUs as well as OpenCL support | +| :doc:`GPU Package ` | for GPUs via CUDA, OpenCL, or ROCm HIP | +-----------------------------------------+-------------------------------------------------------+ | :doc:`USER-INTEL Package ` | for Intel CPUs and Intel Xeon Phi | +-----------------------------------------+-------------------------------------------------------+ @@ -43,7 +43,7 @@ three kinds of hardware, via the listed packages: +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Many-core CPUs | :doc:`USER-INTEL `, :doc:`KOKKOS `, :doc:`USER-OMP `, :doc:`OPT ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ -| NVIDIA/AMD GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +| GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Intel Phi/AVX | :doc:`USER-INTEL `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ @@ -154,8 +154,8 @@ Here is a brief summary of what the various packages provide. Details are in the individual accelerator sections. 
* Styles with a "gpu" suffix are part of the GPU package and can be run - on NVIDIA or AMD GPUs. The speed-up on a GPU depends on a variety of - factors, discussed in the accelerator sections. + on Intel, NVIDIA, or AMD GPUs. The speed-up on a GPU depends on a + variety of factors, discussed in the accelerator sections. * Styles with an "intel" suffix are part of the USER-INTEL package. These styles support vectorized single and mixed precision calculations, in addition to full double precision. In extreme cases, diff --git a/doc/src/compute_temp_chunk.rst b/doc/src/compute_temp_chunk.rst index 77e2568fce..f1c34b42fa 100644 --- a/doc/src/compute_temp_chunk.rst +++ b/doc/src/compute_temp_chunk.rst @@ -153,7 +153,7 @@ temp/chunk calculation to a file is to use the :doc:`fix ave/time compute cc1 all chunk/atom molecule compute myChunk all temp/chunk cc1 temp - fix 1 all ave/time 100 1 100 c_myChunk file tmp.out mode vector + fix 1 all ave/time 100 1 100 c_myChunk[1] file tmp.out mode vector ---------- diff --git a/doc/src/fix_nh.rst b/doc/src/fix_nh.rst index 590211eda7..f40ce0c463 100644 --- a/doc/src/fix_nh.rst +++ b/doc/src/fix_nh.rst @@ -1,8 +1,10 @@ .. index:: fix nvt +.. index:: fix nvt/gpu .. index:: fix nvt/intel .. index:: fix nvt/kk .. index:: fix nvt/omp .. index:: fix npt +.. index:: fix npt/gpu .. index:: fix npt/intel .. index:: fix npt/kk .. index:: fix npt/omp @@ -13,12 +15,12 @@ fix nvt command =============== -Accelerator Variants: *nvt/intel*, *nvt/kk*, *nvt/omp* +Accelerator Variants: *nvt/gpu*, *nvt/intel*, *nvt/kk*, *nvt/omp* fix npt command =============== -Accelerator Variants: *npt/intel*, *npt/kk*, *npt/omp* +Accelerator Variants: *npt/gpu*, *npt/intel*, *npt/kk*, *npt/omp* fix nph command =============== diff --git a/doc/src/fix_nve.rst b/doc/src/fix_nve.rst index 71f8ec300f..ae472b1a38 100644 --- a/doc/src/fix_nve.rst +++ b/doc/src/fix_nve.rst @@ -1,4 +1,5 @@ .. index:: fix nve +.. index:: fix nve/gpu .. index:: fix nve/intel .. index:: fix nve/kk .. index:: fix nve/omp @@ -6,7 +7,7 @@ fix nve command =============== -Accelerator Variants: *nve/intel*, *nve/kk*, *nve/omp* +Accelerator Variants: *nve/gpu*, *nve/intel*, *nve/kk*, *nve/omp* Syntax """""" diff --git a/doc/src/fix_nve_asphere.rst b/doc/src/fix_nve_asphere.rst index af80460b32..c49de34d0b 100644 --- a/doc/src/fix_nve_asphere.rst +++ b/doc/src/fix_nve_asphere.rst @@ -1,10 +1,11 @@ .. index:: fix nve/asphere +.. index:: fix nve/asphere/gpu .. index:: fix nve/asphere/intel fix nve/asphere command ======================= -Accelerator Variants: *nve/asphere/intel* +Accelerator Variants: *nve/asphere/gpu*, *nve/asphere/intel* Syntax """""" diff --git a/doc/src/package.rst b/doc/src/package.rst index 6a5ff44077..1613ff2fae 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -18,7 +18,7 @@ Syntax *gpu* args = Ngpu keyword value ... 
Ngpu = # of GPUs per node
 zero or more keyword/value pairs may be appended
- keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize*
+ keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *omp* or *blocksize* or *platform* or *device_type* or *ocl_args*
 *neigh* value = *yes* or *no*
 yes = neighbor list build on GPU (default)
 no = neighbor list build on CPU
@@ -32,17 +32,20 @@ Syntax
 size = bin size for neighbor list construction (distance units)
 *split* = fraction
 fraction = fraction of atoms assigned to GPU (default = 1.0)
- *gpuID* values = first last
- first = ID of first GPU to be used on each node
- last = ID of last GPU to be used on each node
- *tpa* value = Nthreads
- Nthreads = # of GPU threads used per atom
- *device* value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
- platform_id = numerical OpenCL platform id (default: -1)
- device_type = *kepler* or *fermi* or *cypress* or *intel* or *phi* or *generic* or *custom*
- val1,val2,... = custom OpenCL tune parameters (see below for details)
+ *tpa* value = Nlanes
+ Nlanes = # of GPU vector lanes (CUDA threads) used per atom
 *blocksize* value = size
 size = thread block size for pair force computation
+ *omp* value = Nthreads
+ Nthreads = number of OpenMP threads to use on CPU (default = 0)
+ *platform* value = id
+ id = For OpenCL, platform ID for the GPU or accelerator
+ *gpuID* values = id
+ id = ID of first GPU to be used on each node
+ *device_type* value = *intelgpu* or *nvidiagpu* or *amdgpu* or *applegpu* or *generic* or *custom,val1,val2,...*
+ val1,val2,... = custom OpenCL accelerator configuration parameters (see below for details)
+ *ocl_args* value = args
+ args = List of additional OpenCL compiler arguments delimited by colons
 *intel* args = NPhi keyword value ...
 Nphi = # of co-processors per node
 zero or more keyword/value pairs may be appended
@@ -100,7 +103,7 @@ Syntax
 off = use device acceleration (e.g. GPU) for all available styles in the KOKKOS package (default)
 on = use device acceleration only for pair styles (and host acceleration for others)
 *omp* args = Nthreads keyword value ...
- Nthread = # of OpenMP threads to associate with each MPI process
+ Nthreads = # of OpenMP threads to associate with each MPI process
 zero or more keyword/value pairs may be appended
 keywords = *neigh*
 *neigh* value = *yes* or *no*
@@ -112,12 +115,10 @@ Examples
 .. code-block:: LAMMPS
- package gpu 1
+ package gpu 0
 package gpu 1 split 0.75
 package gpu 2 split -1.0
- package gpu 1 device kepler
- package gpu 1 device 2:generic
- package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
+ package gpu 0 omp 2 device_type intelgpu
 package kokkos neigh half comm device
 package omp 0 neigh no
 package omp 4
@@ -174,10 +175,18 @@ simulations.
 The *gpu* style invokes settings associated with the use of the GPU package.
-The *Ngpu* argument sets the number of GPUs per node. There must be
-at least as many MPI tasks per node as GPUs, as set by the mpirun or
-mpiexec command. If there are more MPI tasks (per node)
-than GPUs, multiple MPI tasks will share each GPU.
+The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
+and no other keywords are specified, GPU or accelerator devices are
+auto-selected. In this process, all platforms are searched for
+accelerator devices and GPUs are chosen if available. The device with
+the highest number of compute cores is selected.
The number of devices
+is increased to be the number of matching accelerators with the same
+number of compute cores. If there are more devices than MPI tasks,
+the additional devices will be unused. The auto-selection of GPUs/
+accelerator devices and platforms can be restricted by specifying
+a non-zero value for *Ngpu* and / or using the *gpuID*, *platform*,
+and *device_type* keywords as described below. If there are more MPI
+tasks (per node) than GPUs, multiple MPI tasks will share each GPU.
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
@@ -212,18 +221,8 @@ overlapped with all other computations on the CPU.
 The *binsize* keyword sets the size of bins used to bin atoms in neighbor list builds performed on the GPU, if *neigh* = *yes* is set.
-If *binsize* is set to 0.0 (the default), then bins = the size of the
-pairwise cutoff + neighbor skin distance. This is 2x larger than the
-LAMMPS default used for neighbor list building on the CPU. This will
-be close to optimal for the GPU, so you do not normally need to use
-this keyword. Note that if you use a longer-than-usual pairwise
-cutoff, e.g. to allow for a smaller fraction of KSpace work with a
-:doc:`long-range Coulombic solver ` because the GPU is
-faster at performing pairwise interactions, then it may be optimal to
-make the *binsize* smaller than the default. For example, with a
-cutoff of 20\*sigma in LJ :doc:`units ` and a neighbor skin
-distance of sigma, a *binsize* = 5.25\*sigma can be more efficient than
-the default.
+If *binsize* is set to 0.0 (the default), then the binsize is set
+automatically using heuristics in the GPU package.
 The *split* keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* <
@@ -257,63 +256,79 @@ cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for the other particles.
-The *gpuID* keyword allows selection of which GPUs on each node will
-be used for a simulation. The *first* and *last* values specify the
-GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
-Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
-of physical GPUs. If you only wish to use a subset, set Ngpu to a
-smaller number and first/last to a sub-range of the available GPUs.
+The *gpuID* keyword is used to specify the first ID for the GPU or
+other accelerator that LAMMPS will use. For example, if the ID is
+1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be
+determined from the output of nvc_get_devices, ocl_get_devices,
+or hip_get_devices
+as provided in the lib/gpu directory. When using OpenCL with
+accelerators that have main memory NUMA, the accelerators can be
+split into smaller virtual accelerators for more efficient use
+with MPI.
-The *tpa* keyword sets the number of GPU thread per atom used to
+The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations. With a default value of 1, the number of
-threads will be chosen based on the pair style, however, the value can
+lanes will be chosen based on the pair style; however, the value can
 be set explicitly with this keyword to fine-tune performance. For
 large cutoffs or with a small number of particles per GPU, increasing
-the value can improve performance. The number of threads per atom must
-be a power of 2 and currently cannot be greater than 32.
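For instance (a sketch under the syntax above; the binary and input names are placeholders), *gpuID* and *tpa* can be combined on the command line to select devices 1-3 and request 8 vector lanes per atom:

.. code-block:: bash

   # three devices starting at device ID 1, 8 GPU vector lanes per atom
   mpirun -np 6 lmp_machine -sf gpu -pk gpu 3 gpuID 1 tpa 8 -in in.script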
-
-The *device* keyword can be used to tune parameters optimized for a
-specific accelerator and platform when using OpenCL. OpenCL supports
-the concept of a **platform**\ , which represents one or more devices that
-share the same driver (e.g. there would be a different platform for
-GPUs from different vendors or for CPU based accelerator support).
-In LAMMPS only one platform can be active at a time and by default
-the first platform with an accelerator is selected. This is equivalent
-to using a platform ID of -1. The platform ID is a number corresponding
-to the output of the ocl_get_devices tool. The platform ID is passed
-to the GPU library, by prefixing the *device* keyword with that number
-separated by a colon. For CUDA, the *device* keyword is ignored.
-Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
-Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
-More devices may be added later. The default device type can be
-specified when building LAMMPS with the GPU library, via setting a
-variable in the lib/gpu/Makefile that is used.
-
-In addition, a device type *custom* is available, which is followed by
-13 comma separated numbers, which allows to set those tweakable parameters
-from the package command. It can be combined with the (colon separated)
-platform id. The individual settings are:
-
-* MEM_THREADS
-* THREADS_PER_ATOM
-* THREADS_PER_CHARGE
-* BLOCK_PAIR
-* MAX_SHARED_TYPES
-* BLOCK_NBOR_BUILD
-* BLOCK_BIO_PAIR
-* BLOCK_ELLIPSE
-* WARP_SIZE
-* PPPM_BLOCK_1D
-* BLOCK_CELL_2D
-* BLOCK_CELL_ID
-* MAX_BIO_SHARED_TYPES
+the value can improve performance. The number of lanes per atom must
+be a power of 2 and currently cannot be greater than the SIMD width
+for the GPU / accelerator. If it exceeds the SIMD width, it
+will automatically be decreased to meet the restriction.
 The *blocksize* keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger block size increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
-thus may lead to load imbalance.
+thus may lead to load imbalance. On modern hardware, the sensitivity
+to the blocksize is typically low.
+
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
+
+The *platform* keyword is only used with OpenCL to specify the ID for
+an OpenCL platform. See the output from ocl_get_devices in the lib/gpu
+directory. In LAMMPS only one platform can be active at a time and by
+default (id=-1) the platform is auto-selected to find the GPU with the
+most compute cores. When *Ngpu* or other keywords are specified, the
+auto-selection is appropriately restricted. For example, if *Ngpu* is
+3, only platforms with at least 3 accelerators are considered. Similar
+restrictions can be enforced by the *gpuID* and *device_type* keywords.
+
+The *device_type* keyword can be used for OpenCL to specify the type of
+GPU to use or specify a custom configuration for an accelerator.
In most cases this selection will be automatic and there is no need to use the
+keyword. The *applegpu* type is not specific to a particular GPU vendor,
+but is separate due to the more restrictive Apple OpenCL implementation.
+For expert users, a custom configuration can be specified with the
+*custom* keyword followed by these parameters:
+
+CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
+THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
+BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
+BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
+PPPM_MAX_SPLINE.
+
+CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
+(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
+vector operations. FAST_MATH in {0,1} indicates that OpenCL fast math
+optimizations are used during the build and hardware-accelerated
+transcendental functions are used when available. THREADS_PER_* give the
+default *tpa* values for ellipsoidal models, styles using charge, and
+any other styles. The BLOCK_* parameters specify the block sizes for
+various kernel calls and the MAX_*SHARED* parameters are used to
+determine the amount of local shared memory to use for storing model
+parameters.
+
+For OpenCL, the routines are compiled at runtime for the specified GPU
+or accelerator architecture. The *ocl_args* keyword can be used to
+specify additional flags for the runtime build.
 ----------
@@ -331,44 +346,13 @@ built with co-processor support.
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
-The *omp* keyword determines the number of OpenMP threads allocated
-for each MPI task when any portion of the interactions computed by a
-USER-INTEL pair style are run on the CPU. This can be the case even
-if LAMMPS was built with co-processor support; see the *balance*
-keyword discussion below. If you are running with less MPI tasks/node
-than there are CPUs, it can be advantageous to use OpenMP threading on
-the CPUs.
-
-.. note::
-
- The *omp* keyword has nothing to do with co-processor threads on
- the Xeon Phi; see the *tpc* and *tptask* keywords below for a
- discussion of co-processor threads.
-
-The *Nthread* value for the *omp* keyword sets the number of OpenMP
-threads allocated for each MPI task. Setting *Nthread* = 0 (the
-default) instructs LAMMPS to use whatever value is the default for the
-given OpenMP environment. This is usually determined via the
-*OMP_NUM_THREADS* environment variable or the compiler runtime, which
-is usually a value of 1.
-
-For more details, including examples of how to set the OMP_NUM_THREADS
-environment variable, see the discussion of the *Nthreads* setting on
-this doc page for the "package omp" command. Nthreads is a required
-argument for the USER-OMP package. Its meaning is exactly the same
-for the USER-INTEL package.
-
-.. note::
-
- If you build LAMMPS with both the USER-INTEL and USER-OMP
- packages, be aware that both packages allow setting of the *Nthreads*
- value via their package commands, but there is only a single global
- *Nthreads* value used by OpenMP. Thus if both package commands are
- invoked, you should insure the two values are consistent. If they are
- not, the last one invoked will take precedence, for both packages.
- Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel"
- command, followed by a "package omp" command, both with a setting of
- *Nthreads* = 0.
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
 The *mode* keyword determines the precision mode to use for computing pair style forces, either on the CPU or on the co-processor,
@@ -574,7 +558,7 @@ result in better performance for certain configurations and system
 sizes. The *omp* style invokes settings associated with the use of the USER-OMP package.
-The *Nthread* argument sets the number of OpenMP threads allocated for
+The *Nthreads* argument sets the number of OpenMP threads allocated for
 each MPI task. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could use two MPI tasks per node (e.g. using the -ppn option of the mpirun
@@ -583,7 +567,7 @@ This would use all 8 cores on each node. Note that the product of MPI
 tasks \* threads/task should not exceed the physical number of cores (on a node), otherwise performance will suffer.
-Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the
+Setting *Nthreads* = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment. This is usually determined via the *OMP_NUM_THREADS* environment variable or the compiler runtime. Note that in most cases the default for OpenMP capable
@@ -614,6 +598,24 @@ input. Not all features of LAMMPS support OpenMP threading via the
 USER-OMP package and the parallel efficiency can be very different, too.
+.. note::
+
+ If you build LAMMPS with the GPU, USER-INTEL, and / or USER-OMP
+ packages, be aware these packages all allow setting of the *Nthreads*
+ value via their package commands, but there is only a single global
+ *Nthreads* value used by OpenMP. Thus if multiple package commands are
+ invoked, you should ensure the values are consistent. If they are
+ not, the last one invoked will take precedence, for all packages.
+ Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a
+ "package omp" command, both with a setting of *Nthreads* = 0. Likewise
+ for a hybrid suffix for gpu and omp. Note that KOKKOS also supports
+ setting the number of OpenMP threads from the command line using the
+ "-k on" :doc:`command-line switch `. The default for
+ KOKKOS is 1 thread per MPI task, so any other number of threads should
+ be explicitly set using the "-k on" command-line switch (and this
+ setting should be consistent with settings from any other packages
+ used).
+
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
@@ -658,9 +660,9 @@ Related commands
 Default
 """""""
-For the GPU package, the default is Ngpu = 1 and the option defaults
+For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and device = not used. These settings are made
+to Ngpu-1, tpa = 1, omp = 0, and platform = -1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch ` is used. If it is not used, you must invoke the package gpu command in your input script or via the "-pk gpu" :doc:`command-line switch `.
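As a sketch of keeping the single global OpenMP thread count consistent across packages (names are placeholders; per the note above, the hybrid suffix issues "package gpu 0" and "package omp 0" by default):

.. code-block:: bash

   # 2 MPI tasks with 4 OpenMP threads each; GPU and USER-OMP agree on Nthreads
   export OMP_NUM_THREADS=4
   mpirun -np 2 lmp_machine -sf hybrid gpu omp -pk gpu 0 omp 4 -in in.script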
diff --git a/doc/src/pair_charmm.rst b/doc/src/pair_charmm.rst index 6d81266a35..b3d2a2b878 100644 --- a/doc/src/pair_charmm.rst +++ b/doc/src/pair_charmm.rst @@ -1,4 +1,5 @@ .. index:: pair_style lj/charmm/coul/charmm +.. index:: pair_style lj/charmm/coul/charmm/gpu .. index:: pair_style lj/charmm/coul/charmm/intel .. index:: pair_style lj/charmm/coul/charmm/kk .. index:: pair_style lj/charmm/coul/charmm/omp @@ -19,7 +20,7 @@ pair_style lj/charmm/coul/charmm command ======================================== -Accelerator Variants: *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* +Accelerator Variants: *lj/charmm/coul/charmm/gpu*, *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* pair_style lj/charmm/coul/charmm/implicit command ================================================= diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index e025e23b09..00fa6ecfaf 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -1,6 +1,6 @@ Sphinx sphinxcontrib-spelling -git+https://github.com/akohlmey/sphinx-fortran@parallel-read +git+git://github.com/akohlmey/sphinx-fortran@parallel-read sphinx_tabs breathe Pygments diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 9937a98850..982e1fde2a 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2297,6 +2297,7 @@ omegaz Omelyan omp OMP +oneAPI onelevel oneway onn @@ -2528,6 +2529,7 @@ ptm PTM ptol ptr +PTX pu purdue Purohit diff --git a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability index 2c101ac77c..e81fedc34a 100644 --- a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability +++ b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability @@ -22,7 +22,7 @@ improper_style class2 read_data tiny_nylon.data variable runsteps equal 1000 -variable prob1 equal step/v_runsteps*2 +variable prob1 equal step/v_runsteps*2+0.1 variable prob2 equal (step/v_runsteps)>0.5 velocity all create 300.0 4928459 dist gaussian diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps index 172640ce6a..21aac89151 100644 --- a/lib/gpu/Makefile.cuda_mps +++ b/lib/gpu/Makefile.cuda_mps @@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # host code compiler and settings -CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC +CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC CUDR_OPTS = -O2 $(LMP_INC) CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ $(CUDPP_OPT) diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip index e2fd3c22d7..dbdef433ec 100644 --- a/lib/gpu/Makefile.hip +++ b/lib/gpu/Makefile.hip @@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG HIP_PRECISION = -D_SINGLE_DOUBLE HIP_OPTS = -O3 -HIP_HOST_OPTS = -Wno-deprecated-declarations +HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp HIP_HOST_INCLUDE = # use device sort diff --git a/lib/gpu/Makefile.lammps.mac_ocl b/lib/gpu/Makefile.lammps.mac_ocl index f6c8a36430..0073efa2ba 100644 --- a/lib/gpu/Makefile.lammps.mac_ocl +++ b/lib/gpu/Makefile.lammps.mac_ocl @@ -1,5 +1,5 @@ # Settings that the LAMMPS build will import when this package library is used -gpu_SYSINC = +gpu_SYSINC = -DFFT_SINGLE gpu_SYSLIB = 
-framework OpenCL gpu_SYSPATH = diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 2aea7f5a46..43d012dc4a 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -1,25 +1,21 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for OpenCL +# Generic Linux Makefile for OpenCL - Mixed precision # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.opencl -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - # this setting should match LAMMPS Makefile # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL LMP_INC = -DLAMMPS_SMALLBIG -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11 -OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL +OCL_INC = +OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -fopenmp -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ @@ -28,4 +24,3 @@ AR = ar BSH = /bin/sh include Opencl.makefile - diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index 62b58c1cef..ae7e8ca6fd 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -1,19 +1,17 @@ # /* ---------------------------------------------------------------------- -# Generic Mac Makefile for OpenCL +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.mac_ocl -OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device +LMP_INC = -DLAMMPS_SMALLBIG -OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT +OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ diff --git a/lib/gpu/Makefile.mac_opencl_mpi b/lib/gpu/Makefile.mac_opencl_mpi new file mode 100644 index 0000000000..9be9f07e93 --- /dev/null +++ b/lib/gpu/Makefile.mac_opencl_mpi @@ -0,0 +1,23 @@ +# /* ---------------------------------------------------------------------- +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.mac_ocl + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +OCL_LINK = -framework OpenCL +OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/Makefile.oneapi b/lib/gpu/Makefile.oneapi new file mode 100644 index 0000000000..015ab47057 --- /dev/null +++ b/lib/gpu/Makefile.oneapi @@ -0,0 +1,26 @@ +# /* ---------------------------------------------------------------------- 
+# Generic Linux Makefile for OpenCL +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.opencl + +# this setting should match LAMMPS Makefile +# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_INC = +OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL +OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile diff --git a/lib/gpu/Makefile.opencl b/lib/gpu/Makefile.opencl deleted file mode 100644 index aa7806b542..0000000000 --- a/lib/gpu/Makefile.opencl +++ /dev/null @@ -1,92 +0,0 @@ -# /* ---------------------------------------------------------------------- -# Generic Linux Makefile for OpenCL -# ------------------------------------------------------------------------- */ - -# which file will be copied to Makefile.lammps - -EXTRAMAKE = Makefile.lammps.opencl - -# this setting should match LAMMPS Makefile -# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL - -LMP_INC = -DLAMMPS_SMALLBIG - -# precision for GPU calculations -# -D_SINGLE_SINGLE # Single precision for all calculations -# -D_DOUBLE_DOUBLE # Double precision for all calculations -# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double - -OCL_PREC = -D_SINGLE_DOUBLE - -BIN_DIR = ./ -OBJ_DIR = ./ -LIB_DIR = ./ -AR = ar -BSH = /bin/sh - -# Compiler and linker settings - -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -g -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -OCL_LINK = -lOpenCL -OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL - -# Headers for Geryon -UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) - -# Source files -SRCS := $(wildcard ./lal_*.cpp) -OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) -CUS := $(wildcard lal_*.cu) -KERS := $(subst ./,$(OBJ_DIR)/,$(CUS:lal_%.cu=%_cl.h)) -KERS := $(addprefix $(OBJ_DIR)/, $(KERS)) - -# targets - -GPU_LIB = $(LIB_DIR)/libgpu.a - -EXECS = $(BIN_DIR)/ocl_get_devices - -all: $(OBJ_DIR) $(KERS) $(GPU_LIB) $(EXECS) - -$(OBJ_DIR): - mkdir -p $@ - -# device code compilation - -$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) - $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; - -# host code compilation - -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) - $(OCL) -o $@ -c $< -I$(OBJ_DIR) - -# build libgpu.a - -$(GPU_LIB): $(OBJS) - $(AR) -crusv $(GPU_LIB) $(OBJS) - @cp $(EXTRAMAKE) Makefile.lammps - -# test app for querying device info - -$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) - $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) - -clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - -veryclean: clean - -rm -rf *~ *.linkinfo - -cleanlib: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 6716388562..d3275b890f 
100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,6 +1,7 @@ # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ + lal_pre_cuda_hip.h ALL_H = $(NVD_H) $(wildcard ./lal_*.h) # Source files @@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # device code compilation -$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin $(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h + rm $(OBJ_DIR)/pppm_f.cubin -$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h + rm $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu @@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo + -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo veryclean: clean -rm -rf *~ *.linkinfo diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 996a564998..2ff98827d4 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -1,8 +1,15 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) +OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h + +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) # Source files SRCS := $(wildcard ./lal_*.cpp) @@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL # device code compilation +$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h + +$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h + +$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h + +$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h + +$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; + +$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h + $(BSH) 
./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h + +$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; + +$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; + +$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; + +$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; + +$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h; + +$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h; + +$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H) $(OCL) -o $@ -c $< -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) diff --git a/lib/gpu/README b/lib/gpu/README index dfa8dcf7ff..dfffe11b81 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -1,21 +1,112 @@ -------------------------------- LAMMPS ACCELERATOR LIBRARY -------------------------------- - + W. Michael Brown (ORNL) Trung Dac Nguyen (ORNL/Northwestern) - Peng Wang (NVIDIA) + Nitin Dhamankar (Intel) Axel Kohlmeyer (Temple) + Peng Wang (NVIDIA) + Anders Hafreager (UiO) + V. Nikolskiy (HSE) + Maurice de Koning (Unicamp/Brazil) + Rodolfo Paula Leite (Unicamp/Brazil) Steve Plimpton (SNL) Inderaj Bains (NVIDIA) -------------------------------------------------------------------- -This directory has source files to build a library that LAMMPS -links against when using the GPU package. 
+------------------------------------------------------------------------------ -This library must be built with a C++ compiler, before LAMMPS is -built, so LAMMPS can link against it. +This directory has source files to build a library that LAMMPS links against +when using the GPU package. + +This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL +before LAMMPS is built, so LAMMPS can link against it. + +This library, libgpu.a, provides routines for acceleration of certain +LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP. + +Pair styles supported by this library are marked in the list of Pair style +potentials with a "g". See the online version at: + +https://lammps.sandia.gov/doc/Commands_pair.html + +In addition, the (plain) pppm kspace style is supported as well. + +------------------------------------------------------------------------------ + DEVICE QUERY +------------------------------------------------------------------------------ +The gpu library includes binaries to check for available GPUs and their +properties. It is a good idea to run these on first use to make sure the +system and build are set up properly. Additionally, the GPU numbering for +specific selection of devices should be taken from this output. The GPU +library may split some accelerators into separate virtual accelerators for +efficient use with MPI. + +After building the GPU library, for OpenCL: + ./ocl_get_devices +for CUDA: + ./nvc_get_devices +and for ROCm HIP: + ./hip_get_devices + +------------------------------------------------------------------------------ + QUICK START +------------------------------------------------------------------------------ +OpenCL: Mac without MPI: + make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs + make g++_serial -j + ./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu + +OpenCL: Mac with MPI: + make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j + mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with Intel oneAPI: + make -f Makefile.oneapi -j; cd ../../src; make oneapi -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with MPI: + make -f Makefile.linux_opencl -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +NVIDIA CUDA: + make -f Makefile.cuda_mps -j; cd ../../src; make omp -j + export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp + nvidia-smi -i 0 -c EXCLUSIVE_PROCESS + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + echo quit | /usr/bin/nvidia-cuda-mps-control + +AMD HIP: + make -f Makefile.hip -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +------------------------------------------------------------------------------ + Installing oneAPI, OpenCL, CUDA, or ROCm +------------------------------------------------------------------------------ +The easiest approach is to use the Linux package manager to perform the +installation from the Intel, NVIDIA, etc. repositories. All are available for +free. The oneAPI installation includes Intel optimized MPI and C++ compilers, +along with many libraries. Alternatively, Intel OpenCL can also be installed +separately from the Intel repository.
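As a sketch of such a package-manager installation (assuming an Ubuntu-like system with the vendor repositories already configured; the package names below are assumptions that vary by distribution and repository):

   sudo apt install intel-hpckit         # Intel oneAPI compilers, MPI, OpenCL
   sudo apt install nvidia-cuda-toolkit  # CUDA toolkit (the SDK is not needed)
   sudo apt install ocl-icd-opencl-dev   # generic OpenCL loader and headers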
+ +NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit. + +See: + +https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html + +https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + +https://github.com/RadeonOpenCompute/ROCm + +------------------------------------------------------------------------------ + Build Intro +------------------------------------------------------------------------------ You can type "make lib-gpu" from the src directory to see help on how to build this library via make commands, or you can do the same thing @@ -25,13 +116,13 @@ do it manually by following the instructions below. Build the library using one of the provided Makefile.* files or create your own, specific to your compiler and system. For example: -make -f Makefile.linux +make -f Makefile.linux_opencl When you are done building this library, two files should exist in this directory: -libgpu.a the library LAMMPS will link against -Makefile.lammps settings the LAMMPS Makefile will import +libgpu.a the library LAMMPS will link against +Makefile.lammps settings the LAMMPS Makefile will import Makefile.lammps is created by the make command, by copying one of the Makefile.lammps.* files. See the EXTRAMAKE setting at the top of the @@ -45,77 +136,52 @@ IMPORTANT: If you re-build the library, e.g. for a different precision Makefile.linux clean, to insure all previous derived files are removed before the new build is done. -Makefile.lammps has settings for 3 variables: - -user-gpu_SYSINC = leave blank for this package -user-gpu_SYSLIB = CUDA libraries needed by this package -user-gpu_SYSPATH = path(s) to where those libraries are - -Because you have the CUDA compilers on your system, you should have -the needed libraries. If the CUDA development tools were installed -in the standard manner, the settings in the Makefile.lammps.standard -file should work. - -------------------------------------------------------------------- - - GENERAL NOTES - -------------------------------- - -This library, libgpu.a, provides routines for GPU acceleration -of certain LAMMPS styles and neighbor list builds. Compilation of this -library requires installing the CUDA GPU driver and CUDA toolkit for -your operating system. Installation of the CUDA SDK is not necessary. -In addition to the LAMMPS library, the binary nvc_get_devices will also -be built. This can be used to query the names and properties of GPU -devices on your system. A Makefile for OpenCL and ROCm HIP compilation -is provided, but support for it is not currently provided by the developers. -Details of the implementation are provided in: - ----- - -Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Short Range -Forces. Computer Physics Communications. 2011. 182: p. 898-911. - -and - -Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle -Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. - -and - -Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High -Performance Computers - Three-Body Potentials. Computer Physics Communications. -2013. 184: p. 2785–2793. - ----- - -NOTE: Installation of the CUDA SDK is not required, only the CUDA -toolkit itself or an OpenCL 1.2 compatible header and library. 
- -Pair styles supporting GPU acceleration this this library -are marked in the list of Pair style potentials with a "g". -See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html - -In addition the (plain) pppm kspace style is supported as well. +NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, + or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in + src/MAKE/Makefile.foo) should be consistent with that specified + when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - MULTIPLE LAMMPS PROCESSES - -------------------------------- - -Multiple LAMMPS MPI processes can share GPUs on the system, but multiple -GPUs cannot be utilized by a single MPI process. In many cases, the -best performance will be obtained by running as many MPI processes as -CPU cores available with the condition that the number of MPI processes -is an integer multiple of the number of GPUs being used. See the -LAMMPS user manual for details on running with GPU acceleration. +------------------------------------------------------------------------------ + PRECISION MODES +------------------------------------------------------------------------------ +The GPU library supports 3 precision modes: single, double, and mixed, with +the latter being the default for most Makefiles aside from Mac-specific +Makefiles due to the more restrictive nature of Apple's OpenCL for some +devices. + +To specify the precision mode (output to the screen before LAMMPS runs for +verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one +of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE. + +Some accelerators or OpenCL implementations only support single precision. +This mode should be used with care and appropriate validation as the errors +can scale with system size in this implementation. This can be useful for +accelerating test runs when setting up a simulation for production runs on +another machine. In the case where only single precision is supported, either +LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration +or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only +as described in the LAMMPS documentation). - BUILDING AND PRECISION MODES - -------------------------------- +------------------------------------------------------------------------------ + CUDA BUILD NOTES +------------------------------------------------------------------------------ +NOTE: when compiling with CMake, all of the considerations listed below +are handled within the CMake configuration process, so no separate +compilation of the gpu library is required. Also, this will build in support +for all compute architectures that are supported by the CUDA toolkit version +used to build the gpu library. -To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of +If you do not want to use a fat binary that supports multiple CUDA +architectures, CUDA_ARCH must be set to match the GPU architecture. This +is reported by the nvc_get_devices executable created by the build process, and +a detailed list of GPU architectures and CUDA compatible GPUs can be found +e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + +The CUDA_HOME variable should be set to the location of the CUDA toolkit. + +To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of the Makefiles. CUDA_ARCH should be set based on the compute capability of your GPU.
This can be verified by running the nvc_get_devices executable after the build is complete. Additionally, the GPU package must be installed and @@ -123,82 +189,93 @@ compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the LAMMPS makefile. Please note that the GPU library accesses the CUDA driver library directly, -so it needs to be linked not only to the CUDA runtime library (libcudart.so) -that ships with the CUDA toolkit, but also with the CUDA driver library -(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS -on the head node of a GPU cluster, this library may not be installed, -so you may need to copy it over from one of the compute nodes (best into -this directory). Recent CUDA toolkits starting from CUDA 9 provide a dummy -libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs), that can be used for -linking. +so it needs to be linked with the CUDA driver library (libcuda.so) that ships +with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU +cluster, this library may not be installed, so you may need to copy it over +from one of the compute nodes (best into this directory). Recent CUDA toolkits +starting from CUDA 9 provide a dummy libcuda.so library (typically under +$(CUDA_HOME)/lib64/stubs) that can be used for linking. -The gpu library supports 3 precision modes as determined by -the CUDA_PRECISION variable: +Best performance with the GPU library is typically with multiple MPI processes +sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA +MPS enabled. To prevent runtime errors for GPUs configured in exclusive process +mode with MPS, the GPU library should be built with either of the equivalent +-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags. - CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations - CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations - CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double +------------------------------------------------------------------------------ + HIP BUILD NOTES +------------------------------------------------------------------------------ -As of CUDA 7.5 only GPUs with compute capability 2.0 (Fermi) or newer are -supported and as of CUDA 9.0 only compute capability 3.0 (Kepler) or newer -are supported. There are some limitations of this library for GPUs older -than that, which require additional preprocessor flag, and limit features, -but they are kept for historical reasons. There is no value in trying to -use those GPUs for production calculations. - -You have to make sure that you set a CUDA_ARCH line suitable for your -hardware and CUDA toolkit version: e.g. -arch=sm_35 for Tesla K20 or K40 -or -arch=sm_52 GeForce GTX Titan X. A detailed list of GPU architectures -and CUDA compatible GPUs can be found e.g. here: -https://en.wikipedia.org/wiki/CUDA#GPUs_supported - -NOTE: when compiling with CMake, all of the considerations listed below -are considered within the CMake configuration process, so no separate -compilation of the gpu library is required. Also this will build in support -for all compute architecture that are supported by the CUDA toolkit version -used to build the gpu library. - -Please note the CUDA_CODE settings in Makefile.linux_multi, which allows -to compile this library with support for multiple GPUs.
This list can be -extended for newer GPUs with newer CUDA toolkits and should allow to build -a single GPU library compatible with all GPUs that are worth using for -GPU acceleration and supported by the current CUDA toolkits and drivers. - -NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, - or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in - src/MAKE/Makefile.foo) should be consistent with that specified - when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - - BUILDING FOR HIP FRAMEWORK - -------------------------------- -1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm). -2. GPU sorting requires installing hipcub +1. GPU sorting requires installing hipcub (https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend additionally requires cub (https://nvlabs.github.io/cub). Download and extract the cub directory to lammps/lib/gpu/ or specify an appropriate path in lammps/lib/gpu/Makefile.hip. -3. In Makefile.hip it is possible to specify the target platform via -export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target +2. In Makefile.hip it is possible to specify the target platform via +export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target architecture (gfx803, gfx900, gfx906 etc.) -4. If your MPI implementation does not support `mpicxx --showme` command, +3. If your MPI implementation does not support `mpicxx --showme` command, it is required to specify the corresponding MPI compiler and linker flags in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip. -5. Building the GPU library (libgpu.a): - cd lammps/lib/gpu; make -f Makefile.hip -j -6. Building the LAMMPS executable (lmp_hip): - cd ../../src; make hip -j - EXAMPLE CONVENTIONAL BUILD PROCESS - -------------------------------- - -cd ~/lammps/lib/gpu -emacs Makefile.linux -make -f Makefile.linux -./nvc_get_devices -cd ../../src -emacs ./MAKE/Makefile.linux -make yes-asphere -make yes-kspace -make yes-gpu -make linux +------------------------------------------------------------------------------ + OPENCL BUILD NOTES +------------------------------------------------------------------------------ +If GERYON_NUMA_FISSION is defined at build time, LAMMPS will consider separate +NUMA nodes on GPUs or accelerators as separate devices. For example, a 2-socket +CPU would appear as two separate devices for OpenCL (and LAMMPS would require +two MPI processes to use both sockets with the GPU library - each with its +own device ID as output by ocl_get_devices). + +For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove +"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options. 
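A sketch of such a debug build with Makefile.linux_opencl (overriding OCL_TUNE on the make command line rather than editing the Makefile is just one workflow; command-line variables take precedence over Makefile assignments):

   make -f Makefile.linux_opencl clean
   make -f Makefile.linux_opencl -j \
       OCL_TUNE="-fopenmp -DGERYON_NUMA_FISSION -DUCL_DEBUG -DGERYON_KERNEL_DUMP"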
+ +------------------------------------------------------------------------------ + ALL PREPROCESSOR OPTIONS (For Advanced Users) +------------------------------------------------------------------------------ +_SINGLE_SINGLE Build library for single precision mode +_SINGLE_DOUBLE Build library for mixed precision mode +_DOUBLE_DOUBLE Build library for double precision mode +CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA +CUDA_PROXY Same as above +MPI_GERYON Library should use MPI_Abort for unhandled errors +GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into + multiple virtual accelerators for each NUMA node +LAL_USE_OMP=0 Disable OpenMP in lib, regardless of compiler setting +LAL_USE_OMP_SIMD=0 Disable OpenMP SIMD in lib, regardless of compiler setting +GERYON_OCL_FLUSH For OpenCL, flush queue after every enqueue +LAL_NO_OCL_EV_JIT Turn off JIT specialization for kernels in OpenCL +LAL_USE_OLD_NEIGHBOR Use old neighbor list algorithm +USE_CUDPP Enable GPU binning in neighbor builds (not recommended) +USE_HIP_DEVICE_SORT Enable GPU binning for HIP builds + (only w/ LAL_USE_OLD_NEIGHBOR) +LAL_NO_BLOCK_REDUCE Use host for energy/virial accumulation +LAL_OCL_EXTRA_ARGS Supply extra args for OpenCL compiler delimited with : +UCL_NO_EXIT LAMMPS should handle errors instead of Geryon lib +UCL_DEBUG Debug build for Geryon +GERYON_KERNEL_DUMP Dump all compiled OpenCL programs with compiler + flags and build logs +GPU_CAST Casting performed on GPU, untested recently +THREE_CONCURRENT Concurrent 3-body calcs in separate queues, untested + + +------------------------------------------------------------------------------ + References for Details +------------------------------------------------------------------------------ + +Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Short Range +Forces. Computer Physics Communications. 2011. 182: p. 898-911. + +and + +Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle +Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. + +and + +Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High +Performance Computers - Three-Body Potentials. Computer Physics Communications. +2013. 184: p. 2785–2793.
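To illustrate how the options above enter a build (the particular flag combination is illustrative only), extra defines can simply be appended to the tuning flags, e.g. to disable OpenMP inside the library and flush the OpenCL queue after every enqueue:

   make -f Makefile.linux_opencl -j \
       OCL_TUNE="-DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DLAL_USE_OMP=0 -DGERYON_OCL_FLUSH"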
diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h index d2fb1919b7..373b3783b0 100644 --- a/lib/gpu/geryon/hip_device.h +++ b/lib/gpu/geryon/hip_device.h @@ -24,6 +24,8 @@ namespace ucl_hip { // -------------------------------------------------------------------------- typedef hipStream_t command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(hipStream_t &stream) { CU_SAFE_CALL(hipStreamSynchronize(stream)); } @@ -143,15 +145,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -215,7 +228,19 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } - + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim];} + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } + + /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes @@ -255,11 +280,20 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatibility with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE
type=UCL_GPU, + const std::string vendor="") + { return set_platform(0); } inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){ auto it = _loaded_modules.emplace(program, hipModule_t()); diff --git a/lib/gpu/geryon/hip_kernel.h b/lib/gpu/geryon/hip_kernel.h index c5014b52e7..10bc9f1334 100644 --- a/lib/gpu/geryon/hip_kernel.h +++ b/lib/gpu/geryon/hip_kernel.h @@ -14,6 +14,7 @@ #include #include #include +#include <cstdio> namespace ucl_hip { @@ -64,7 +65,7 @@ class UCL_Program { } /// Load a program from a string and compile with flags - inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) { + inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) { return _device_ptr->load_module(program, _module, log); } @@ -73,6 +74,7 @@ class UCL_Program { hipModule_t _module; hipStream_t _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/hip_texture.h b/lib/gpu/geryon/hip_texture.h index ae16bee900..9117adc879 100644 --- a/lib/gpu/geryon/hip_texture.h +++ b/lib/gpu/geryon/hip_texture.h @@ -107,6 +107,37 @@ class UCL_Texture { } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template <class numtyp> + inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) { + CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const void* begin() const { return &_global; } + inline void clear() {} + + private: + hipStream_t _cq; + void* _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 42f176bcbf..52b2ed478e 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -37,6 +37,8 @@ namespace ucl_cudadr { // -------------------------------------------------------------------------- typedef CUstream command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(CUstream &stream) { CU_SAFE_CALL(cuStreamSynchronize(stream)); } @@ -156,15 +158,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return
preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -228,6 +241,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -268,11 +293,22 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatibility with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1) + { return set_platform(0); } private: int _device, _num_devices; diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index d74b0e2dc1..c31b8cdf9b 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -26,6 +26,7 @@ #include "nvd_device.h" #include +#include <cstdio> namespace ucl_cudadr { @@ -77,7 +78,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { if (std::string(flags)=="BINARY") return load_binary((const char *)program); const unsigned int num_opts=2; @@ -100,12 +101,25 @@ if (err != CUDA_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl + std::cerr << std::endl << std::endl << "----------------------------------------------------------\n" << " UCL Error: Error compiling PTX Program...\n" << "----------------------------------------------------------\n"; - std::cerr << log << std::endl; + std::cerr << log << std::endl + << "----------------------------------------------------------\n\n"; #endif + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); +
fprintf(foutput," UCL Error: Error compiling PTX Program...\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } return UCL_COMPILE_ERROR; } @@ -139,11 +153,15 @@ return UCL_SUCCESS; } + /// Return the default command queue/stream associated with this data + inline command_queue & cq() { return _cq; } + friend class UCL_Kernel; private: CUmodule _module; CUstream _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h index c766af826c..65f4ad9ef5 100644 --- a/lib/gpu/geryon/nvd_texture.h +++ b/lib/gpu/geryon/nvd_texture.h @@ -38,8 +38,11 @@ class UCL_Texture { inline UCL_Texture(UCL_Program &prog, const char *texture_name) { get_texture(prog,texture_name); } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) - { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } + inline void get_texture(UCL_Program &prog, const char *texture_name) { + #if (CUDA_VERSION < 11000) + CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); + #endif + } /// Bind a float array where each fetch grabs a vector of length numel template @@ -72,11 +75,14 @@ } private: + #if (CUDA_VERSION < 11000) CUtexref _tex; + #endif friend class UCL_Kernel; template inline void _bind_float(mat_typ &vec, const unsigned numel) { + #if (CUDA_VERSION < 11000) #ifdef UCL_DEBUG assert(numel!=0 && numel<5); #endif @@ -90,10 +96,42 @@ else CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2)); } + #endif } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template <class numtyp> + inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) { + CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const CUdeviceptr * begin() const { return &_global; } + inline void clear() {} + + private: + CUstream _cq; + CUdeviceptr _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index de4def0bc1..435ee24dd3 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -28,12 +28,8 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as * there are known issues with OpenCL 2.0 and later.
- * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 #endif #ifdef __APPLE__ @@ -55,17 +51,36 @@ namespace ucl_opencl { typedef cl_command_queue command_queue; typedef cl_context context_type; +inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); } + inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } -inline bool _shared_mem_device(cl_device_type &device_type) { +#if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON) +inline bool _shared_mem_device(cl_device_id &device) { return true; } +#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF) +inline bool _shared_mem_device(cl_device_id &device) { return false; } +#else +inline bool _shared_mem_device(cl_device_id &device) { + #ifdef CL_VERSION_1_2 + cl_bool br; + CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &br,NULL)); + return (br == CL_TRUE); + #else + cl_device_type device_type; + CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, + sizeof(device_type),&device_type,NULL)); return (device_type==CL_DEVICE_TYPE_CPU); + #endif } +#endif struct OCLProperties { std::string name; cl_device_type device_type; + bool is_subdevice; cl_ulong global_mem; cl_ulong shared_mem; cl_ulong const_mem; @@ -74,12 +89,16 @@ struct OCLProperties { size_t work_group_size; size_t work_item_size[3]; bool double_precision; + int preferred_vector_width32, preferred_vector_width64; int alignment; size_t timer_resolution; bool ecc_support; std::string c_version; bool partition_equal, partition_counts, partition_affinity; cl_uint max_sub_devices; + int cl_device_version; + bool has_subgroup_support; + bool has_shuffle_support; }; /// Class for looking at data parallel device properties @@ -182,16 +201,27 @@ class UCL_Device { inline std::string device_type_name(const int i); /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i); + inline enum UCL_DEVICE_TYPE device_type(const int i); /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) - { return _shared_mem_device(_properties[i].device_type); } + { return _shared_mem_device(_cl_devices[i]); } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].preferred_vector_width32;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].preferred_vector_width64;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -242,6 +272,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return 
_properties[i].work_group_size; } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].work_item_size[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].shared_mem; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -256,6 +298,12 @@ inline bool sharing_supported(const int i) { return true; } + /// True if the device is a sub-device + inline bool is_subdevice() + { return is_subdevice(_device); } + /// True if the device is a sub-device + inline bool is_subdevice(const int i) + { return _properties[i].is_subdevice; } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } @@ -274,6 +322,18 @@ /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return _properties[i].partition_affinity; } + /// True if the device has subgroup support + inline bool has_subgroup_support() + { return has_subgroup_support(_device); } + /// True if the device has subgroup support + inline bool has_subgroup_support(const int i) + { return _properties[i].has_subgroup_support; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return _properties[i].has_shuffle_support; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() @@ -281,6 +341,12 @@ /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return _properties[i].max_sub_devices; } + /// OpenCL version supported by the device + inline int cl_device_version() + { return cl_device_version(_device); } + /// OpenCL version supported by the device + inline int cl_device_version(const int i) + { return _properties[i].cl_device_version; } /// List all devices along with all properties inline void print_all(std::ostream &out); @@ -288,8 +354,14 @@ /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } - /// Select the platform that has accelerators - inline int set_platform_accelerator(int pid=-1); + /// Automatically set the platform by type, vendor, and/or CU count + /** If first_device is positive, search is restricted to platforms containing + * this device ID.
If ndevices is positive, search is restricted + * to platforms with at least that many devices **/ + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1); private: int _num_platforms; // Number of platforms @@ -322,8 +394,7 @@ UCL_Device::UCL_Device() { return; } else _num_platforms=static_cast(nplatforms); - // note that platform 0 may not necessarily be associated with accelerators - set_platform_accelerator(); + set_platform(0); } UCL_Device::~UCL_Device() { @@ -332,6 +403,14 @@ UCL_Device::~UCL_Device() { void UCL_Device::clear() { _properties.clear(); + + #ifdef GERYON_NUMA_FISSION + #ifdef CL_VERSION_1_2 + for (int i=0; i<_cl_devices.size(); i++) + CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i])); + #endif + #endif + _cl_devices.clear(); if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { @@ -341,6 +420,7 @@ void UCL_Device::clear() { CL_DESTRUCT_CALL(clReleaseContext(_context)); } _device=-1; + _num_devices=0; } int UCL_Device::set_platform(int pid) { @@ -370,11 +450,51 @@ int UCL_Device::set_platform(int pid) { CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, &n)); + #ifndef GERYON_NUMA_FISSION // --- Store properties for each device for (int i=0; i<_num_devices; i++) { _cl_devices.push_back(device_list[i]); add_properties(device_list[i]); } + #else + // --- Create sub-devices for anything partitionable by NUMA and store props + int num_unpart = _num_devices; + _num_devices = 0; + for (int i=0; i 1) { + subdevice_list = new cl_device_id[num_subdevices]; + CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices, + subdevice_list, &num_subdevices)); + } + #endif + + for (int j=0; j 1) delete[] subdevice_list; + } // for i + #endif + delete[] device_list; return UCL_SUCCESS; } @@ -429,11 +549,18 @@ void UCL_Device::add_properties(cl_device_id device_list) { sizeof(cl_uint),&op.alignment,nullptr)); op.alignment/=8; + cl_uint float_width; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + sizeof(float_width),&float_width,nullptr)); + op.preferred_vector_width32=float_width; + // Determine if double precision is supported cl_uint double_width; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(double_width),&double_width,nullptr)); + op.preferred_vector_width64=double_width; if (double_width==0) op.double_precision=false; else @@ -452,9 +579,14 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.ecc_support=true; op.c_version=""; + op.is_subdevice=false; op.partition_equal=false; op.partition_counts=false; op.partition_affinity=false; + op.max_sub_devices=1; + op.cl_device_version=0; + op.has_subgroup_support=false; + op.has_shuffle_support=false; #ifdef CL_VERSION_1_2 size_t return_bytes; @@ -463,6 +595,13 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.c_version=buffer; cl_device_partition_property pinfo[4]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE, + 4*sizeof(cl_device_partition_property), + &pinfo, &return_bytes)); + if (return_bytes == 0) op.is_subdevice=false; + else if (pinfo[0]) op.is_subdevice=true; + else op.is_subdevice=false; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_PROPERTIES, 4*sizeof(cl_device_partition_property), @@ -480,6 +619,46 @@ void UCL_Device::add_properties(cl_device_id device_list) { CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, 
sizeof(cl_uint),&op.max_sub_devices,nullptr)); + + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr)); + int cl_version_maj = buffer[7] - '0'; + int cl_version_min = buffer[9] - '0'; + op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10; + + size_t ext_str_size_ret; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr, + &ext_str_size_ret)); + char buffer2[ext_str_size_ret]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, + ext_str_size_ret, buffer2, nullptr)); + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + if (op.cl_device_version >= 210) { + if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) || + (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)) + op.has_subgroup_support=true; + if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos) + op.has_shuffle_support=true; + } + #endif + if (std::string(buffer2).find("cl_nv_device_attribute_query") != + std::string::npos) { + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #endif + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #endif + cl_uint major, minor; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &major, nullptr)); + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &minor, nullptr)); + double arch = static_cast<double>(minor)/10+major; + if (arch >= 3.0) + op.has_shuffle_support=true; + } #endif _properties.push_back(op); @@ -516,7 +695,7 @@ std::string UCL_Device::device_type_name(const int i) { } // Get a string telling the type of the device -int UCL_Device::device_type(const int i) { +enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return UCL_CPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return UCL_GPU; @@ -529,14 +708,8 @@ // Set the CUDA device to the specified device number int UCL_Device::set(int num) { - cl_device_id *device_list = new cl_device_id[_num_devices]; - cl_uint n; - CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, - device_list,&n)); _device=num; - _cl_device=device_list[_device]; - delete[] device_list; + _cl_device=_cl_devices[_device]; return create_context(); } @@ -555,6 +728,11 @@ void UCL_Device::print_all(std::ostream &out) { out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; + out << " Is a subdevice: "; + if (is_subdevice(i)) + out << "Yes\n"; + else + out << "No\n"; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; @@ -613,33 +791,93 @@ void UCL_Device::print_all(std::ostream &out) { out << "No\n"; out << " Maximum subdevices from fission: " << max_sub_devices(i) << std::endl; + out << " Shared memory system: "; + if (shared_memory(i)) + out << "Yes\n"; + else + out << "No\n"; } } } -// Select the platform that is associated with accelerators -// if pid < 0, select the first platform -int UCL_Device::set_platform_accelerator(int pid) { - if (pid < 0) { - int found = 0; - for (int n=0; n<_num_platforms; n++) { - set_platform(n); - for (int i=0; i<num_devices(); i++) { - if (device_type(i)==UCL_GPU || device_type(i)==UCL_ACCELERATOR) { - found=1; - break; - } - } +int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type, + const std::string vendor, + const int ndevices, + const int first_device) { + int last_device = -1; + if (first_device > -1) { + if (ndevices) + last_device = first_device + ndevices - 1; + else + last_device = first_device; + } + + bool vendor_match=false; + bool
type_match=false; + int max_cus=0; + int best_platform=0; + + std::string vendor_upper=vendor; + for (int i=0; i<vendor_upper.length(); i++) + if (vendor_upper[i]<='z' && vendor_upper[i]>='a') + vendor_upper[i]=toupper(vendor_upper[i]); + + for (int n=0; n<_num_platforms; n++) { + set_platform(n); + if (last_device > -1 && last_device >= num_devices()) continue; + if (ndevices > num_devices()) continue; + + int first_id=0; + int last_id=num_devices()-1; + if (last_device > -1) { + first_id=first_device; + last_id=last_device; + } + + if (vendor_upper!="") { + std::string pname = platform_name(); + for (int i=0; i<pname.length(); i++) + if (pname[i]<='z' && pname[i]>='a') + pname[i]=toupper(pname[i]); + + if (pname.find(vendor_upper)!=std::string::npos) { + if (vendor_match == false) { + best_platform=n; + max_cus=0; + vendor_match=true; + } + } else if (vendor_match) + continue; + } + + if (type != UCL_DEFAULT) { + bool ptype_matched=false; + for (int d=first_id; d<=last_id; d++) { + if (type==device_type(d)) { + if (type_match == false) { + best_platform=n; + max_cus=0; + type_match=true; + ptype_matched=true; + } + } + } + if (type_match==true && ptype_matched==false) + continue; + } + + for (int d=first_id; d<=last_id; d++) { + if (cus(d) > max_cus) { + best_platform=n; + max_cus=cus(d); } - if (found) return UCL_SUCCESS; } - return UCL_ERROR; - } else { - return set_platform(pid); } + return set_platform(best_platform); } -} // namespace ucl_opencl +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 77593f4515..23f9baa09e 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -2,6 +2,7 @@ ocl_kernel.h ------------------- W. Michael Brown + Nitin Dhamankar (Intel) Utilities for dealing with OpenCL kernels @@ -26,6 +27,7 @@ #include "ocl_device.h" #include +#include <cstdio> namespace ucl_opencl { @@ -93,7 +95,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { cl_int error_flag; const char *prog=(const char *)program; _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag); @@ -107,27 +109,66 @@ sizeof(cl_build_status),&build_status, nullptr)); - if (build_status != CL_SUCCESS || log!=nullptr) { + #ifdef GERYON_KERNEL_DUMP + { size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, - nullptr, &ms)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); char *build_log = new char[ms]; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, - build_log, nullptr)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); + std::cout << std::endl << std::endl + << "--------------------------------------------------------\n" + << " UCL PROGRAM DUMP\n" + << "--------------------------------------------------------\n" + << flags << std::endl + << "--------------------------------------------------------\n" + << prog << std::endl + << "--------------------------------------------------------\n" + << build_log + << "--------------------------------------------------------\n" + << std::endl << std::endl; + } + #endif + + if (build_status != CL_SUCCESS || log!=NULL) { + size_t ms; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); + char *build_log = new char[ms]; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); if
(log!=nullptr) *log=std::string(build_log); if (build_status != CL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl - << "----------------------------------------------------------\n" - << " UCL Error: Error compiling OpenCL Program (" - << build_status << ") ...\n" - << "----------------------------------------------------------\n"; + std::cerr << std::endl << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling OpenCL Program (" + << build_status << ") ...\n" + << "----------------------------------------------------------\n"; std::cerr << build_log << std::endl; + std::cerr << + "----------------------------------------------------------\n" + << std::endl << std::endl; #endif - delete[] build_log; + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput, + " UCL Error: Error compiling OpenCL Program (%d) ...\n", + build_status); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",build_log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } + delete[] build_log; return UCL_COMPILE_ERROR; } else delete[] build_log; } @@ -141,6 +182,7 @@ class UCL_Program { inline void cq(command_queue &cq_in) { _cq=cq_in; } friend class UCL_Kernel; + friend class UCL_Const; private: bool _init_done; cl_program _program; @@ -322,9 +364,45 @@ class UCL_Kernel { inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + inline size_t max_subgroup_size(const size_t block_size_x) { + size_t block_size = block_size_x; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y) { + size_t block_size[2] { block_size_x, block_size_y }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y, + const size_t block_size_z) { + size_t block_size[3] { block_size_x, block_size_y, block_size_z }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + #endif + private: cl_kernel _kernel; cl_program _program; + cl_device_id _device; cl_uint _dimensions; size_t _block_size[3]; size_t _num_blocks[3]; @@ -338,6 +416,11 @@ class UCL_Kernel { unsigned _kernel_info_nargs; //std::string _kernel_info_args[256]; #endif + + #ifdef CL_VERSION_2_1 + size_t _mx_subgroup_sz; // Maximum sub-group size for this kernel + #endif + }; inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { @@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) CL_SAFE_CALL(clRetainCommandQueue(_cq)); _program=program._program; CL_SAFE_CALL(clRetainProgram(_program)); + _device=program._device; cl_int error_flag; 
_kernel=clCreateKernel(program._program,function,&error_flag); @@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) } void UCL_Kernel::run() { - CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr, - _num_blocks,_block_size,0,nullptr,nullptr)); + CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, + _num_blocks,_block_size,0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + ucl_flush(_cq); + #endif } } // namespace diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index aeff689859..0e9ce78389 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -4,12 +4,8 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as - * there are known issues with OpenCL 2.0 and later. - * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 #endif #ifdef __APPLE__ diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 740020ab18..8937d4145a 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -106,9 +106,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag); if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - map_perm,0,n,0,nullptr,nullptr,nullptr); + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); mat.cq()=cm.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, } template -inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { +inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, + const size_t n) { cl_int error_flag; - cl_context context; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), - &context,nullptr)); - cl_mem_flags orig_flags; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), - &orig_flags,nullptr)); - orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; - - mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, - *mat.host_ptr(), &error_flag); + cl_buffer_region subbuffer; + subbuffer.origin = o; + subbuffer.size = n; + mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0, + CL_BUFFER_CREATE_TYPE_REGION, &subbuffer, + &error_flag); CL_CHECK_ERR(error_flag); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { size_t kn=n/sizeof(typename mat_type::data_type); CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); #endif + #ifdef GERYON_OCL_FLUSH + ucl_flush(cq); + #endif } // -------------------------------------------------------------------------- @@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> { std::cerr << "UCL_COPY 1NS\n"; #endif CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, - dst.begin(),0,nullptr,nullptr)); + dst.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> { src_offset+=spitch; 
dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> { std::cerr << "UCL_COPY 3NS\n"; #endif CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, - src.begin(),0,nullptr,nullptr)); + src.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> { src_offset+=spitch; dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -690,6 +702,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -720,6 +735,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } }; diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 0e60045f55..43de4b258c 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -53,6 +53,59 @@ class UCL_Texture { friend class UCL_Kernel; }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() : _global_bytes(0), _active(false) {} + ~UCL_Const() { clear(); } + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _active = true; + _context = prog._context; + _cq = prog._cq; + CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainCommandQueue(_cq)); + } + /// Copy from array on host to const memory + template + inline void update_device(UCL_H_Vec &src, const int numel) { + const int bytes=numel*sizeof(numtyp); + if (_global_bytes < bytes) { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + cl_int e; + _global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e); + CL_SAFE_CALL(e); + } + CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes, + (void *)src.begin(), 0, NULL, NULL)); + } + /// Get device ptr associated with object + inline const cl_mem * begin() const { return &_global; } + inline void clear() { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _global_bytes=0; + _active=false; + } + + private: + cl_mem _global; + size_t _global_bytes; + cl_context _context; + cl_command_queue _cq; + bool _active; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 8e8ffa929e..ca74312d51 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -61,7 +61,6 @@ class UCL_Timer { /// Initialize command queue for timing inline void init(UCL_Device &dev, command_queue &cq) { clear(); - t_factor=dev.timer_resolution()/1000000000.0; _cq=cq; clRetainCommandQueue(_cq); _initialized=true; @@ -124,17 +123,17 @@ class UCL_Timer { clReleaseEvent(start_event); clReleaseEvent(stop_event); has_measured_time = false; - return (tend-tstart)*t_factor; + return (tend-tstart)*1e-6; } 
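A note on the arithmetic in time() above: with t_factor gone, the timer no longer queries dev.timer_resolution(); OpenCL event profiling timestamps are defined in nanoseconds, so scaling the difference by 1e-6 yields milliseconds directly. A minimal sketch of the computation the method wraps (the helper name elapsed_ms is hypothetical; it assumes the command queue was created with CL_QUEUE_PROFILING_ENABLE):

    // Elapsed milliseconds between two profiled OpenCL events.
    // CL_PROFILING_COMMAND_START/END return cl_ulong nanosecond timestamps.
    static double elapsed_ms(cl_event start_event, cl_event stop_event) {
      cl_ulong tstart, tend;
      clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START,
                              sizeof(cl_ulong), &tstart, NULL);
      clGetEventProfilingInfo(stop_event, CL_PROFILING_COMMAND_END,
                              sizeof(cl_ulong), &tend, NULL);
      return (tend - tstart) * 1e-6;  // ns -> ms
    }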
/// Return the time (s) of last start to stop - Forces synchronization - inline double seconds() { return time()/1000.0; } + inline double seconds() { return time()*1e-3; } /// Return the total time in ms inline double total_time() { return _total_time; } /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } + inline double total_seconds() { return _total_time*1e-3; } private: cl_event start_event, stop_event; diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 07e23aebe7..51fd33d623 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -69,17 +69,17 @@ class UCL_BaseMat { + /// Ensure that any pending ops in the associated command queue have been issued to the device + inline void flush() { ucl_flush(_cq); } + /// Return the type/permissions of memory allocation /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED * or UCL_VIEW **/ inline enum UCL_MEMOPT kind() const { return _kind; } inline bool shared_mem_device() { #ifdef _OCL_MAT cl_device_id device; CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, - sizeof(cl_device_id),&device,nullptr)); - cl_device_type device_type; - CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, - sizeof(device_type),&device_type,nullptr)); - return _shared_mem_device(device_type); + sizeof(cl_device_id),&device,NULL)); + return _shared_mem_device(device); #else return false; #endif diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index cd2a90fe2d..e791f18f29 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Vec() : _cols(0) {} + UCL_D_Vec() : _cols(0), _row_bytes(0) {} ~UCL_D_Vec() { _device_free(*this); } /// Construct with n columns diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp index b8dfc6f7b1..5654bb40bd 100644 --- a/lib/gpu/geryon/ucl_get_devices.cpp +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -44,10 +44,8 @@ using namespace ucl_hip; int main(int argc, char** argv) { UCL_Device cop; std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; - if (cop.num_platforms()>0) { - std::cout << "Using platform: " << cop.platform_name() << std::endl; + if (cop.num_platforms()>0) cop.print_all(std::cout); - } return 0; } diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 1df3c2de4b..41dad2b285 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes*_rows); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows); #endif } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index a9d64349d9..5de0c312b0 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _cols(0) { + UCL_H_Vec() : _cols(0), _row_bytes(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif @@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); - _array=input.begin(); + _array=(numtyp *)input.begin(); _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); @@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); -
_array=input.begin()+offset; + _array=(numtyp *)input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes); #endif } diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 7fe2604de6..c03fd31fce 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -162,7 +162,9 @@ class UCL_Vector { inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } - + /// Ensure that any pending ops in the associated command queue have been issued to the device + inline void flush() { ucl_flush(host.cq()); } + ///Get the size of a row on the host (including any padding) in elements inline size_t row_size() const { return host.row_size(); } /// Get the size of a row on the host(including any padding) in bytes diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 803b781286..4a68466d05 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -14,6 +14,9 @@ ***************************************************************************/ #include "lal_answer.h" +#if (LAL_USE_OMP == 1) +#include <omp.h> +#endif namespace LAMMPS_AL { #define AnswerT Answer<numtyp,acctyp> @@ -56,7 +59,7 @@ bool AnswerT::alloc(const int inum) { template <class numtyp, class acctyp> bool AnswerT::init(const int inum, const bool charge, const bool rot, - UCL_Device &devi) { + UCL_Device &devi) { clear(); bool success=true; @@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, _time_cast=0.0; _time_cpu_idle=0.0; + success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE, + UCL_WRITE_ONLY)==UCL_SUCCESS); + if (success) error_flag.zero(); + return success && alloc(ef_inum); } @@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) { template <class numtyp, class acctyp> void AnswerT::clear() { _gpu_bytes=0; + error_flag.clear(); if (!_allocated) return; _allocated=false; @@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const { template <class numtyp, class acctyp> void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom) { + const bool ef_atom, const bool vf_atom, + const int red_blocks) { time_answer.start(); _eflag=eflag; _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; + #ifdef LAL_NO_BLOCK_REDUCE + _ev_stride=_inum; + #else + if (ef_atom || vf_atom) + _ev_stride=_inum; + else + _ev_stride=red_blocks; + #endif int csize=_ev_fields; if (!eflag) csize-=6; if (csize>0) - engv.update_host(_inum*csize,true); + engv.update_host(_ev_stride*csize,true); if (_rot) force.update_host(_inum*4*2,true); else force.update_host(_inum*4,true); time_answer.stop(); + + #ifndef GERYON_OCL_FLUSH + force.flush(); + #endif } template <class numtyp, class acctyp> void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, - int *ilist) { + const bool ef_atom, const bool vf_atom, + int *ilist, const int red_blocks) { _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); + copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks); } template <class numtyp, class acctyp> @@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom, double evdwl=0.0; int vstart=0; if (_eflag) { - for (int i=0; i<_inum; i++) + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd reduction(+:evdwl) + #endif + for (int i=0; i<_ev_stride; i++) evdwl+=engv[i]; if (_ef_atom) { if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) + for (int
i=0; i<_ev_stride; i++) eatom[i]+=engv[i]; } else { - for (int i=0; i<_inum; i++) + for (int i=0; i<_ev_stride; i++) eatom[_ilist[i]]+=engv[i]; } } - vstart=_inum; + vstart=_ev_stride; } if (_vflag) { - int iend=vstart+_inum; + int iend=vstart+_ev_stride; for (int j=0; j<6; j++) { for (int i=vstart; i void AnswerT::get_answers(double **f, double **tor) { - int fl=0; if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) { - f[i][0]+=force[fl]; - f[i][1]+=force[fl+1]; - f[i][2]+=force[fl+2]; - fl+=4; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=force[fl]; - tor[i][1]+=force[fl+1]; - tor[i][2]+=force[fl+2]; - fl+=4; + typedef struct { double x,y,z; } vec3d; + typedef struct { acctyp x,y,z,w; } vec4d_t; + vec3d *fp=reinterpret_cast(&(f[0][0])); + vec4d_t *forcep=reinterpret_cast(&(force[0])); + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = omp_get_num_threads(); + const int tid = omp_get_thread_num(); + const int idelta = _inum / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = std::min(ifrom + idelta, _inum); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = _inum; + #endif + + for (int i=ifrom; i(&(tor[0][0])); + vec4d_t *torquep=reinterpret_cast(&(force[_inum*4])); + for (int i=ifrom; i force; /// Energy and virial per-atom storage UCL_Vector engv; + /// Error flag + UCL_Vector error_flag; /// Device timers UCL_Timer time_answer; @@ -162,7 +166,7 @@ class Answer { bool alloc(const int inum); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; + int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride; int *_ilist; double _time_cast, _time_cpu_idle; diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 7ce3e3e7ff..cda4d383b5 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -414,9 +414,9 @@ const char *atom=0; template void AtomT::compile_kernels(UCL_Device &dev) { - std::string flags = "-D"+std::string(OCL_VENDOR); + std::string flags = ""; atom_program=new UCL_Program(dev); - atom_program->load_string(atom,flags); + atom_program->load_string(atom,flags,nullptr,screen); k_cast_x.set_function(*atom_program,"kernel_cast_x"); _compiled=true; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index e39740d6c8..3cf97d94a0 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -24,6 +24,9 @@ #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; +#ifndef LAL_NO_OCL_EV_JIT +#define LAL_OCL_EV_JIT +#endif #elif defined(USE_CUDART) #include "geryon/nvc_timer.h" #include "geryon/nvc_mat.h" @@ -178,7 +181,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -197,7 +200,26 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants into 2 vectors and copy to device + template + inline void type_pack2(const int n, UCL_D_Vec &dev_v, + UCL_H_Vec &buffer, t1 ***one, t2 ***two) { + int ii=0; + for (int i=0; i(one[i][j][k]); + buffer[ii*2+1]=static_cast(two[i][j][k]); + ii++; + } + } + } + UCL_H_Vec view; + view.view_offset(0,buffer,n*n*n); ucl_copy(dev_v,view,false); } @@ -217,7 +239,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - 
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -238,7 +260,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -251,7 +273,7 @@ class Atom { buffer[i*2+1]=static_cast(two[i][i]); } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),n,*dev); + view.view_offset(0,buffer,n); ucl_copy(dev_v,view,false); } @@ -261,6 +283,9 @@ class Atom { inline void data_unavail() { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + typedef struct { double x,y,z; } vec3d; + typedef struct { numtyp x,y,z,w; } vec4d_t; + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { if (_x_avail==false) { @@ -269,13 +294,16 @@ class Atom { memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *xp=reinterpret_cast(&(x[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - x[wl]=host_ptr[i][0]; - x[wl+1]=host_ptr[i][1]; - x[wl+2]=host_ptr[i][2]; - x[wl+3]=host_type[i]; - wl+=4; + xp[i].x=host_p[i].x; + xp[i].y=host_p[i].y; + xp[i].z=host_p[i].z; + xp[i].w=host_type[i]; } #endif _time_cast+=MPI_Wtime()-t; @@ -320,6 +348,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall; i++) q[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -346,6 +379,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -370,13 +408,16 @@ class Atom { memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *vp=reinterpret_cast(&(v[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - v[wl]=host_ptr[i][0]; - v[wl+1]=host_ptr[i][1]; - v[wl+2]=host_ptr[i][2]; - v[wl+3]=host_tag[i]; - wl+=4; + vp[i].x=host_p[i].x; + vp[i].y=host_p[i].y; + vp[i].z=host_p[i].z; + vp[i].w=host_tag[i]; } #endif _time_cast+=MPI_Wtime()-t; diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 5b7150d950..be00abbcef 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -40,170 +40,521 @@ nbor_begin+=offset; \ } -#if (ARCH < 300) +#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride+ii*(t_per_atom-1); \ + stride=fast_mul(t_per_atom,nbor_stride); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \ + (t_per_atom-1)); \ + nbor_begin+=offset; -#define store_answers(f, energy, virial, ii, inum, tid, 
t_per_atom, offset, \ - eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ +#if (SHUFFLE_AVAIL == 0) + +#define simd_reduce_add1(width, local, offset, tid, one) \ + local[0][tid]=one; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (offset==0) one=local[0][tid]; + +#define simd_reduce_add2(width, local, offset, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ } \ } \ if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ + one=local[0][tid]; \ + two=local[1][tid]; \ + } + +#define simd_reduce_add3(width, local, offset, tid, one, two, three) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + } + +#define simd_reduce_add6(width, local, offset, tid, one, two, three, \ + four, five, six) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + local[3][tid]=four; \ + local[4][tid]=five; \ + local[5][tid]=six; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ + local[3][tid] += local[3][tid+s]; \ + local[4][tid] += local[4][tid+s]; \ + local[5][tid] += local[5][tid+s]; \ + } \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + four=local[3][tid]; \ + five=local[4][tid]; \ + six=local[5][tid]; \ + } + +#define simd_reduce_arr(trip, width, local, offset, tid, arr) \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid==0) one=local[0][tid]; \ + } + +#define block_reduce_add2(width, local, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + } \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += 
local[1][tid+s]; \ + } \ + } \ + if (tid==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + } \ + } + +#define block_reduce_arr(trip, width, local, tid, arr) \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + for (int r=0; r1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - red_acc[4][tid]=e_coul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<5; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - e_coul=red_acc[4][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii0; s>>=1) one += shfl_down(one, s, width); + +#define simd_reduce_add2(width, one, two) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + } + +#define simd_reduce_add3(width, one, two, three) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + } + +#define simd_reduce_add6(width, one, two, three, four, five, six) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + four += shfl_down(four, s, width); \ + five += shfl_down(five, s, width); \ + six += shfl_down(six, s, width); \ + } + +#define simd_reduce_arr(trip, width, arr) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + for (int r=0; r1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int 
ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if 
(offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii global_device; template -BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) { +BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, } template -void BaseAtomicT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -177,13 +185,27 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseAtomicT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success) { + const int nall, double **host_x, int *host_type, + int 
*ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -218,14 +240,28 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int ** BaseAtomicT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const { template void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if 
(pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseAtomic; diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index c97f42c50e..701675390f 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -53,10 +53,11 @@ class BaseAtomic { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -100,7 +101,7 @@ class BaseAtomic { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -179,23 +180,31 @@ class BaseAtomic { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; protected: bool _compiled; - int _block_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k, + const int onetype); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index d5a6e06222..b0d08e4df7 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + 
#endif } template @@ -64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, } template -void BaseChargeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseChargeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,14 +187,28 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseChargeT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int 
red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -226,15 +248,29 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** BaseChargeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseCharge; diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index b6d3e9e3f8..6b8761092a 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -57,7 +57,7 @@ class BaseCharge { const 
void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -103,7 +103,7 @@ class BaseCharge { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +177,15 @@ class BaseCharge { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -194,7 +200,7 @@ class BaseCharge { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 57773a3b80..9781065b13 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,true,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, 
*atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -183,12 +190,26 @@ template void BaseDipoleT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -232,14 +253,28 @@ template int** BaseDipoleT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); mu_tex.get_texture(*pair_program,"mu_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete 
pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDipole; diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 856b69b56b..f7cefd9066 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -102,7 +102,7 @@ class BaseDipole { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -176,9 +176,16 @@ class BaseDipole { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -187,14 +194,14 @@ class BaseDipole { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index e4fd80fcc3..4b6a964bfb 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const { template int BaseDPDT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const void *pair_program, const char *k_name) { + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - 
nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); + _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseDPDT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, - const double dtinvsqrt, const int seed, const int timestep, +void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -231,8 +251,8 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double 
*subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, + int **nspecial, tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, @@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const { template void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; + std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); vel_tex.get_texture(*pair_program,"vel_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDPD; diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 5d1573c1a9..9eb56993af 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -52,7 +52,8 @@ class BaseDPD { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver void 
estimate_gpu_overhead(); @@ -101,7 +102,7 @@ class BaseDPD { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +178,16 @@ class BaseDPD { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -191,13 +199,14 @@ class BaseDPD { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 524705ed41..98411a8033 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0; extern Device global_device; template -BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { +BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0), + host_olist_size(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { ellipsoid_program=nullptr; lj_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + ellipsoid_program_noev=nullptr; + lj_program_noev=nullptr; + #endif } template @@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() { if (nbor_program) delete nbor_program; if (ellipsoid_program) delete ellipsoid_program; if (lj_program) delete lj_program; + #if defined(LAL_OCL_EV_JIT) + k_ellipsoid_noev.clear(); + k_ellipsoid_sphere_noev.clear(); + k_sphere_ellipsoid_noev.clear(); + k_lj_fast.clear(); + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + if (lj_program_noev) delete lj_program_noev; + #endif } template @@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,true,1); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,true,1); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (_multiple_forms && 
gpu_nbor!=0) return -9; - if (_multiple_forms) + if (_multiple_forms) { ans->force.zero(); - - // Memory for ilist ordered by particle type - if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) - return -3; + host_olist_size = nbor->max_atoms(); + host_olist = new int[nbor->max_atoms()]; + } _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); @@ -160,7 +172,10 @@ template void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); - host_olist.clear(); + if (host_olist_size) { + host_olist_size = 0; + delete []host_olist; + } time_nbor1.clear(); time_ellipsoid.clear(); @@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() { MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (device->replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[7]>0.0) { int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); @@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1 && t_time>0) { + if (device->procs_per_gpu()==1 && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); @@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } - if (nbor->gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Vector width: %d.\n", device->simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor->gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); @@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, if (shared_types) { k_nbor_fast.set_size(GX,BX); k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, - &inum, &nbor->dev_packed, &form_low, &form_high); + &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } else { k_nbor.set_size(GX,BX); k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, - &start, &inum, &nbor->dev_packed, &form_low, &form_high); + &start, &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } } @@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum, p++; } } - nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); + 
nbor->get_host(inum,host_olist,numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); return; } @@ -330,8 +352,8 @@ inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); nbor->copy_unpacked(inum,mn); _last_ellipse=inum; _max_last_ellipse=inum; @@ -348,11 +370,18 @@ template int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, } int *list; if (_multiple_forms) - list=host_olist.begin(); + list=host_olist; else list=ilist; @@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, atom->add_quat_data(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,list); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); return list; @@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, +int** BaseEllipsoidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall *jnum=nbor->host_acc.begin(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; - std::string flags=device->compile_string(); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; if (nbor_program) delete nbor_program; nbor_program=new UCL_Program(dev); - nbor_program->load_string(ellipsoid_nbor,flags.c_str()); + nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); 
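// All three programs here (nbor, ellipsoid, lj) are JIT-built with -DEVFLAG=1,
// i.e. with energy/virial accumulation compiled in; when LAL_OCL_EV_JIT is
// defined, a second -DEVFLAG=0 build follows below and set_kernel() selects
// between the two variants at run time.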
k_nbor.set_function(*nbor_program,"kernel_nbor"); neigh_tex.get_texture(*nbor_program,"pos_tex"); if (ellipsoid_program) delete ellipsoid_program; ellipsoid_program=new UCL_Program(dev); - ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); + ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); k_ellipsoid.set_function(*ellipsoid_program,kname); pos_tex.get_texture(*ellipsoid_program,"pos_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex"); if (lj_program) delete lj_program; lj_program=new UCL_Program(dev); - lj_program->load_string(lj_string,flags.c_str()); + lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj.set_function(*lj_program,s_lj.c_str()); @@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + ellipsoid_program_noev=new UCL_Program(dev); + ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); + k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname); + + if (lj_program_noev) delete lj_program_noev; + lj_program_noev=new UCL_Program(dev); + lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen); + k_sphere_ellipsoid_noev.set_function(*lj_program_noev, + s_sphere_ellipsoid.c_str()); + k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str()); + if (e_s) + k_ellipsoid_sphere_noev.set_function(*lj_program_noev, + s_ellipsoid_sphere.c_str()); + #else + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseEllipsoid; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index dc1e624a2f..f30a0062d2 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -88,10 +88,10 @@ class BaseEllipsoid { ans->resize(nlocal, success); if (_multiple_forms) ans->force.zero(); - if (olist_size>static_cast(host_olist.numel())) { - host_olist.clear(); - int new_size=static_cast(static_cast(olist_size)*1.10); - success=success && 
(host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + if (olist_size>host_olist_size) { + if (host_olist_size) delete []host_olist; + host_olist_size=static_cast(static_cast(olist_size)*1.10); + host_olist = new int[host_olist_size]; } nbor->resize(nlocal,host_inum,max_nbors,success); @@ -116,7 +116,7 @@ class BaseEllipsoid { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_nbor1.add_to_total(); time_ellipsoid.add_to_total(); if (_multiple_forms) { @@ -223,14 +223,40 @@ class BaseEllipsoid { /// Neighbor data Neighbor *nbor; /// ilist with particles sorted by type - UCL_H_Vec host_olist; + int *host_olist; + int host_olist_size; // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *nbor_program, *ellipsoid_program, *lj_program; + UCL_Program *ellipsoid_program_noev, *lj_program_noev; UCL_Kernel k_nbor_fast, k_nbor; UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; UCL_Kernel k_lj_fast, k_lj; + UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev; + UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev; + UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (_multiple_forms == false) { + if (eflag || vflag) k_elps_sel = &k_ellipsoid; + else k_elps_sel = &k_ellipsoid_noev; + } else { + if (eflag || vflag) { + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + } else { + k_elps_sel = &k_ellipsoid_noev; + k_elps_sphere_sel = &k_ellipsoid_sphere_noev; + k_sphere_elps_sel = &k_sphere_ellipsoid_noev; + k_lj_sel = &k_lj_fast_noev; + } + } + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex; @@ -240,7 +266,6 @@ class BaseEllipsoid { int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; - UCL_D_Vec *_nbor_data; // True if we want to use fast GB-sphere or sphere-sphere calculations bool _multiple_forms; @@ -250,7 +275,7 @@ class BaseEllipsoid { void compile_kernels(UCL_Device &dev, const void *ellipsoid_string, const void *lj_string, const char *kname,const bool e_s); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index cfc138aea2..660385eb56 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ namespace LAMMPS_AL { extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { #endif pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -44,12 +47,18 @@ BaseThreeT::~BaseThree() { k_pair.clear(); k_short_nbor.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_three_center_noev.clear(); + k_three_end_noev.clear(); + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template int 
BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const { int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+ - nbor->bytes_per_atom(max_nbors); + nbor->bytes_per_atom(max_nbors); #ifdef THREE_CONCURRENT b+=ans2->bytes_per_atom(); #endif @@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *two, const char *three_center, - const char *three_end, const char *short_nbor) { + const char *three_end, const char *short_nbor, + const int onetype, const int onetype3, + const int spq, const int tpa_override) { screen=_screen; int gpu_nbor=0; @@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else // neigh yes or tpa == 1 - _nbor_data=&(nbor->dev_nbor); - if (_threads_per_atom*_threads_per_atom>device->warp_size()) - return -10; + // Allow forcing threads per atom to 1 for tersoff due to subg sync issue + if (tpa_override) + _threads_per_atom=tpa_override; + else + _threads_per_atom=device->threads_per_three(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); _block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end, + short_nbor,onetype,onetype3,spq); + + while (_threads_per_atom*_threads_per_atom>device->simd_size()) + _threads_per_atom = _threads_per_atom / 2; + + if (_threads_per_atom*_threads_per_atom>device->simd_size()) + return -10; + + success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,true,1,true); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall, pos_tex.bind_float(atom->x,4); + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT _max_an_bytes+=ans2->gpu_bytes(); #endif - int ef_nall=nall; - if (ef_nall==0) - ef_nall=2000; - dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); - return 0; } template -void BaseThreeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseThreeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); - dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, // now the requirement is removed, allowing to work within pair hybrid nbor->get_host(nlist,ilist,numj,firstneigh,block_size()); + nbor->copy_unpacked(nlist,mn); 
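// Keep the unpacked neighbor list resident on the device; since this patch
// drops the dedicated dev_short_nbor buffer, the short-range list is
// presumably rebuilt from this unpacked copy inside the pair kernels.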
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -201,24 +215,32 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - bool &success) { +inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { success=true; resize_atom(inum,nall,success); resize_local(nall,host_inum,nbor->max_nbors(),success); if (!success) - return 0; + return; atom->cast_copy_x(host_x,host_type); _nall = nall; + // Increase the effective sub-domain size for neighbors of ghosts + // This is still inefficient because we are calculating neighbors for more + // ghosts than necessary due to increased ghost cutoff + const double ncut=nbor->cutoff()*2.0; + for (int i=0; i<3; i++) sublo[i]-=ncut; + for (int i=0; i<3; i++) subhi[i]+=ncut; + int mn; - nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + nbor->copy_unpacked(nall,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, #endif if (bytes>_max_an_bytes) _max_an_bytes=bytes; - return mn; } // --------------------------------------------------------------------------- @@ -236,10 +257,24 @@ template void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nlist; @@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); 
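// loop() now returns the grid size it launched; copy_answers() receives it
// as red_blocks, apparently to bound the reduction over the per-block
// energy/virial partials (see the LAL_NO_BLOCK_REDUCE fallback above).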
+ const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); #ifdef THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int ** BaseThreeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { +int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return nullptr; @@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nall; @@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); #ifdef THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *two, const char *three_center, - const char *three_end, const char* short_nbor) { - if (_compiled) + const char *three_end, const char* short_nbor, + const int onetype, const int onetype3, + const int spq) { + if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq) return; + 
_onetype=onetype; + _onetype3=onetype3; + _spq=spq; + std::string vatom_name=std::string(three_end)+"_vatom"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_three_center.set_function(*pair_program,three_center); k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); @@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_three_center_noev.set_function(*pair_program_noev,three_center); + k_three_end_noev.set_function(*pair_program_noev,three_end); + k_pair_noev.set_function(*pair_program_noev,two); + #else + k_sel = &k_pair; + k_3center_sel = &k_three_center; + k_3end_sel = &k_three_end; + #endif + #ifdef THREE_CONCURRENT k_three_end.cq(ucl_device->cq(_end_command_queue)); k_three_end_vatom.cq(ucl_device->cq(_end_command_queue)); + #if defined(LAL_OCL_EV_JIT) + k_three_end_noev.cq(ucl_device->cq(_end_command_queue)); + #endif #endif _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseThree; diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 36129e6168..3e830d4217 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -59,10 +59,12 @@ class BaseThree { const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, const char *k_three_center, const char *k_three_end, - const char *k_short_nbor=nullptr); + const char *k_short_nbor=nullptr, const int onetype=-1, + const int onetype3=-1, const int spq=0, + const int tpa_override=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** 
\param success set to false if insufficient memory **/
@@ -109,7 +111,7 @@ class BaseThree {
  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
-      nbor->acc_timers();
+      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
@@ -134,9 +136,9 @@ class BaseThree {
                int *numj, int **firstneigh, bool &success);

  /// Build neighbor list on device
-  int build_nbor_list(const int inum, const int host_inum,
-                      const int nall, double **host_x, int *host_type,
-                      double *sublo, double *subhi, tagint *tag, int **nspecial,
+  void build_nbor_list(const int inum, const int host_inum, const int nall,
+                       double **host_x, int *host_type, double *sublo,
+                       double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);

  /// Pair loop with host neighboring
@@ -147,12 +149,12 @@ class BaseThree {
                int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
-  int ** compute(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type, double *sublo,
-                 double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **numj, const double cpu_time, bool &success);
+  int ** compute(const int ago, const int inum_full, const int nall,
+                 double **host_x, int *host_type, double *sublo,
+                 double *subhi, tagint *tag, int **nspecial, tagint **special,
+                 const bool eflag, const bool vflag, const bool eatom,
+                 const bool vatom, int &host_start, int **ilist,
+                 int **numj, const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------
@@ -188,14 +190,29 @@ class BaseThree {
  /// Neighbor data
  Neighbor *nbor;

-  UCL_D_Vec dev_short_nbor;
  UCL_Kernel k_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
-  UCL_Program *pair_program;
+  UCL_Program *pair_program, *pair_program_noev;
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
+  UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
+  UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;
  inline int block_pair() { return _block_pair; }
  inline int block_size() { return _block_size; }
+  inline void set_kernel(const int eflag, const int vflag) {
+    #if defined(LAL_OCL_EV_JIT)
+    if (eflag || vflag) {
+      k_sel = &k_pair;
+      k_3center_sel = &k_three_center;
+      k_3end_sel = &k_three_end;
+    } else {
+      k_sel = &k_pair_noev;
+      k_3center_sel = &k_three_center_noev;
+      k_3end_sel = &k_three_end_noev;
+    }
+    #endif
+  }
+
  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
@@ -203,18 +220,19 @@ class BaseThree {
 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
-  int _gpu_nbor;
+  int _gpu_nbor, _onetype, _onetype3, _spq;
  double _max_bytes, _max_an_bytes;
-  int _max_nbors, _ainum, _nall;
+  int _ainum, _nall;
  double _gpu_overhead, _driver_overhead;
-  UCL_D_Vec *_nbor_data;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *two, const char *three_center,
-                      const char *three_end, const char* short_nbor);
+                      const char *three_end, const char* short_nbor,
+                      const int onetype, const int onetype3,
+                      const int spq);

-  virtual void loop(const bool _eflag, const bool _vflag,
-                    const int evatom) = 0;
+  virtual int loop(const int eflag, const int vflag, const int evatom,
+                   bool &success) = 0;
};
}
diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp
index be1722c32c..57551d9787
100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BeckT::loop(const bool _eflag, const bool _vflag) { +int BeckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Beck; diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index f24132b9a2..a2a15e4d21 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_beck_fast(const __global numtyp4 *restrict x_, @@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -218,8 +224,8 @@ 
__kernel void k_beck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index 638f1bf626..c6413ed766 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -72,7 +72,7 @@ class Beck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index dcba4e4f40..ab65237e27 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, int init_ok=0; if (world_me==0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, - AA, BB, special_lj, inum, nall, 300, + AA, BB, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, } if (gpu_rank==i && world_me!=0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 4a6b789687..c4796b3450 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -138,20 +138,9 @@ double BornT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornT::loop(const bool _eflag, const bool _vflag) { +int BornT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2, &cutsq_sigma, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Born; diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index f9fea6d618..825175af8f 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; 
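// EVFLAG is a compile-time constant (-DEVFLAG=0/1 in the JIT build string),
// so the no-EV program variant compiles these accumulation branches away
// entirely; at run time eflag/vflag are 0 (off), 1 (global), or 2 (per-atom).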
energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_born_fast(const __global numtyp4 *restrict x_, @@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 2a7f355d69..3f5277b682 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -82,7 +82,7 @@ class Born : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 1b147395f6..8c7084f4a4 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { +int BornCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, @@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulLong; diff --git 
a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 14e644b45a..d38a101c30 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, @@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long.h b/lib/gpu/lal_born_coul_long.h index e383d18e0c..a33b8f436a 100644 --- a/lib/gpu/lal_born_coul_long.h +++ b/lib/gpu/lal_born_coul_long.h @@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long_cs.cu 
b/lib/gpu/lal_born_coul_long_cs.cu index 6f04fcea94..077ec2f74f 100644 --- a/lib/gpu/lal_born_coul_long_cs.cu +++ b/lib/gpu/lal_born_coul_long_cs.cu @@ -63,6 +63,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long_cs_ext.cpp b/lib/gpu/lal_born_coul_long_cs_ext.cpp index badc8b0808..fc6b89692f 100644 --- a/lib/gpu/lal_born_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_long_cs_ext.cpp @@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, 
host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index d0825529b1..9d17f2fa7d 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 1624dd9d50..e6caebbab8 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { +int BornCoulWolfT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulWolf; diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index 0eeda48ec0..aefcac8127 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -51,6 +51,9 @@ __kernel void 
k_born_coul_wolf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, @@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git 
a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index fa53f48939..0aad07dfa5 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -81,7 +81,7 @@ class BornCoulWolf : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_wolf_cs.cu b/lib/gpu/lal_born_coul_wolf_cs.cu index b957b8be69..866d256f33 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cu +++ b/lib/gpu/lal_born_coul_wolf_cs.cu @@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, @@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -288,7 
+294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp index e2211644af..ae162a7c52 100644 --- a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp @@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index d664f30212..bc38db1b9c 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 63991889d9..2321a1264d 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->world_barrier(); @@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, 
host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index 5a335a1e51..01411775e1 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckT::loop(const bool _eflag, const bool _vflag) { +int BuckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Buck; diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 0f9044cefc..958c7bdd4d 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_buck_fast(const __global numtyp4 *restrict x_, @@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } 
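// The EVFLAG-guarded accumulator setup above is per-thread, but the
// __syncthreads() below must be reached before the neighbor loop so the
// shared coeff1/coeff2 tables written by the first threads of the block
// are visible to all of them. EVFLAG itself is a compile-time constant;
// the idiom in miniature (hypothetical names, not the library's source):
//
//   template <int EVFLAG>
//   __global__ void pair_sketch(int eflag, int vflag /*, ... */) {
//     float energy, virial[6];
//     if (EVFLAG) { energy = 0.f; for (int k = 0; k < 6; k++) virial[k] = 0.f; }
//     // force loop: every (EVFLAG && eflag) / (EVFLAG && vflag) branch
//     // is eliminated entirely in the EVFLAG==0 instantiation, saving
//     // registers and work in pure-force timesteps.
//   }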
__syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index 7a09fae5dd..5755dea230 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -77,7 +77,7 @@ class Buck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index 25607eae17..c3c70e6d4d 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulT::loop(const bool _eflag, const bool _vflag) { +int BuckCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoul; diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 163c8e4362..2aaa9c9b3d 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += 
delz*delz*force; @@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, @@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index eebba78eb0..bd2afcf9d8 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index 2a089e2040..9cf8f9b00e 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 1c0288c2d8..60205a2ad6 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { 
+int BuckCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoulLong; diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index b1bbf67bc2..f5ce3a7d11 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, @@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) 
e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul_long.h b/lib/gpu/lal_buck_coul_long.h index e2d69475cf..fa978a70be 100644 --- a/lib/gpu/lal_buck_coul_long.h +++ b/lib/gpu/lal_buck_coul_long.h @@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index c7e1cd1e35..393ccc3feb 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index cc8b77c0a9..738b33337d 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); @@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_charmm.cpp b/lib/gpu/lal_charmm.cpp new file mode 100644 index 0000000000..811a431cc7 --- /dev/null +++ b/lib/gpu/lal_charmm.cpp @@ -0,0 +1,166 @@ +/*************************************************************************** + charmm.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
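  Unlike the charmm/coul/long variant below in this patch, both the LJ
  and the Coulomb terms are smoothly switched to zero between an inner
  and an outer cutoff, so no k-space companion is required; the
  switching-polynomial denominators (denom_lj, denom_coul) are
  precomputed on the host and handed to init().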
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "charmm_cl.h"
+#elif defined(USE_CUDART)
+const char *charmm=0;
+#else
+#include "charmm_cubin.h"
+#endif
+
+#include "lal_charmm.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define CHARMMT CHARMM<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+CHARMMT::CHARMM() : BaseCharge<numtyp,acctyp>(),
+                    _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+CHARMMT::~CHARMM() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int CHARMMT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double *host_special_lj, const int nlocal, const int nall,
+                  const int max_nbors, const int maxspecial,
+                  const double cell_size, const double gpu_split,
+                  FILE *_screen, double host_cut_ljsq,
+                  const double host_cut_coulsq, double *host_special_coul,
+                  const double qqrd2e, const double cut_lj_innersq,
+                  const double cut_coul_innersq, const double denom_lj,
+                  const double denom_coul, double **epsilon,
+                  double **sigma, const bool mix_arithmetic) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,charmm,"k_charmm");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (this->_block_bio_size>=64 && mix_arithmetic &&
+      lj_types<=max_bio_shared_types)
+    shared_types=true;
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  int h_size=lj_types*lj_types;
+  if (h_size<max_bio_shared_types)
+    h_size=max_bio_shared_types;
+  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<h_size*32; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_lj3,host_lj4);
+
+  if (shared_types) {
+    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
+    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  }
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_bothsq = host_cut_bothsq;
+  _cut_coulsq = host_cut_coulsq;
+  _cut_ljsq = host_cut_ljsq;
+  _cut_lj_innersq = cut_lj_innersq;
+  _cut_coul_innersq = cut_coul_innersq;
+  _qqrd2e=qqrd2e;
+  _denom_lj=denom_lj;
+  _denom_coul=denom_coul;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void CHARMMT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  ljd.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double CHARMMT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(CHARMM<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int CHARMMT::loop(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int
BX=this->_block_bio_size; + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class CHARMM; +} diff --git a/lib/gpu/lal_charmm.cu b/lib/gpu/lal_charmm.cu new file mode 100644 index 0000000000..42fb810796 --- /dev/null +++ b/lib/gpu/lal_charmm.cu @@ -0,0 +1,303 @@ +// ************************************************************************** +// charmm.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the charmm/coul pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_charmm(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd, + const __global numtyp *restrict sp_lj, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_bio(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 
(numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} + +__kernel void k_charmm_fast(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} diff --git a/lib/gpu/lal_charmm.h b/lib/gpu/lal_charmm.h new file mode 100644 index 0000000000..0793d7ca0f --- /dev/null +++ b/lib/gpu/lal_charmm.h @@ -0,0 +1,89 @@ +/*************************************************************************** + charmm.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
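  The interface follows the other pair classes in this library: init()
  uploads the packed lj1 coefficients plus the epsilon/sigma pairs used
  for geometric/arithmetic mixing, and loop() launches k_charmm_fast when
  the per-type constants fit in shared memory, falling back to k_charmm.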
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_CHARMM_
+#define LAL_CHARMM_
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class CHARMM : public BaseCharge<numtyp, acctyp> {
+ public:
+  CHARMM();
+  ~CHARMM();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double host_cut_bothsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double cut_lj_innersq,
+           const double cut_coul_innersq, const double denom_lj,
+           const double denom_coul, double **epsilon, double **sigma,
+           const bool mix_arithmetic);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// x = lj1, y = lj2, z = lj3, w = lj4
+  UCL_D_Vec<numtyp4> lj1;
+  /// x = epsilon, y = sigma
+  UCL_D_Vec<numtyp2> ljd;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e, _denom_lj, _denom_coul;
+
+  numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
+  numtyp _cut_coul_innersq;
+
+ private:
+  bool _allocated;
+  int loop(const int eflag, const int vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_charmm_ext.cpp b/lib/gpu/lal_charmm_ext.cpp
new file mode 100644
index 0000000000..bed2f21933
--- /dev/null
+++ b/lib/gpu/lal_charmm_ext.cpp
@@ -0,0 +1,137 @@
+/***************************************************************************
+                                charmm_ext.cpp
+                             -------------------
+                            W. Michael Brown (ORNL)
+
+  Functions for LAMMPS access to charmm/coul/charmm acceleration routines.
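  The crm_gpu_* functions below are the C bindings the GPU package's
  pair style calls: crm_gpu_init() copies constants to the device,
  crm_gpu_compute_n()/crm_gpu_compute() run a step with device- or
  host-built neighbor lists respectively, and crm_gpu_clear() and
  crm_gpu_bytes() handle teardown and memory reporting.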
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_charmm.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static CHARMM<PRECISION,ACC_PRECISION> CRMMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double host_cut_ljsq, double host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e,
+                 const double cut_lj_innersq, const double cut_coul_innersq,
+                 const double denom_lj, const double denom_coul,
+                 double **epsilon, double **sigma,
+                 const bool mix_arithmetic) {
+  CRMMF.clear();
+  gpu_mode=CRMMF.device->gpu_mode();
+  double gpu_split=CRMMF.device->particle_split();
+  int first_gpu=CRMMF.device->first_device();
+  int last_gpu=CRMMF.device->last_device();
+  int world_me=CRMMF.device->world_me();
+  int gpu_rank=CRMMF.device->gpu_rank();
+  int procs_per_gpu=CRMMF.device->procs_per_gpu();
+
+  CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu,
+                             last_gpu);
+
+  bool message=false;
+  if (CRMMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                       host_lj4, special_lj, inum, nall, max_nbors,
+                       maxspecial, cell_size, gpu_split, screen,
+                       host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                       qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
+                       denom_coul, epsilon, sigma, mix_arithmetic);
+
+  CRMMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                         host_lj4, special_lj, inum, nall, max_nbors,
+                         maxspecial, cell_size, gpu_split, screen,
+                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                         qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
+                         denom_coul, epsilon, sigma, mix_arithmetic);
+    CRMMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    CRMMF.estimate_gpu_overhead();
+
+  return init_ok;
+}
+
+void crm_gpu_clear() {
+  CRMMF.clear();
+}
+
+int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall,
+                        double **host_x, int *host_type, double *sublo,
+                        double *subhi, tagint *tag, int **nspecial,
+                        tagint **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success, double *host_q, double *boxlo,
+                        double *prd) {
+  return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_q, boxlo, prd);
+}
+
+void crm_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     const int nlocal, double *boxlo, double *prd) {
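  // Host-built neighbor list path: LAMMPS supplies ilist/numj/firstneigh
  // here, unlike crm_gpu_compute_n() above, where the list is built on
  // the device from positions alone.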
CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double crm_gpu_bytes() { + return CRMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index a78996a7d5..8008b1fbb3 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { +int CHARMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->_block_bio_size; - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CHARMMLong; diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index 4e9802f368..77793d0e83 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; + int n_stride; + local_allocate_store_bio(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global 
numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 5d9d9ea50b..69f1a0734a 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 743b510825..13565f5682 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, int init_ok=0; if (world_me==0) CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, cell_size, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,sigma,mix_arithmetic); @@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index c441d50968..fec7a3ad5f 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ColloidT::loop(const bool _eflag, const bool _vflag) { +int ColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &colloid1, &colloid2, &form, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Colloid; diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index 4983142aa0..8a20f0c400 100644 --- 
a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, @@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 35426007d8..43f14cd354 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -81,7 +81,7 @@ class Colloid : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index 961ad75925..dcfd1a6d34 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -60,7 +60,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, - host_sigma6, host_form, inum, nall, 300, + host_sigma6, host_form, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); 
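
The kernels above all move to the same compile-time pattern: an EVFLAG preprocessor constant guards every energy/virial accumulator, so a kernel built with EVFLAG set to 0 drops the accumulation code entirely instead of testing eflag/vflag per pair, and loop() now forwards eflag/vflag straight through as ints rather than converting from bool in each wrapper. A minimal standalone sketch of the idiom (hypothetical function name; the real kernels receive eflag/vflag as kernel arguments exactly as shown above):

    #define EVFLAG 1   // set per kernel build; 0 compiles the force-only path
    void pair_accumulate(float force, float delx, float dely, float delz,
                         int eflag, int vflag, float e_pair,
                         float &energy, float (&virial)[6]) {
      if (EVFLAG && eflag)
        energy += e_pair;            // dead code when EVFLAG == 0
      if (EVFLAG && vflag) {         // same six terms as in the kernels above
        virial[0] += delx*delx*force;
        virial[1] += dely*dely*force;
        virial[2] += delz*delz*force;
        virial[3] += delx*dely*force;
        virial[4] += delx*delz*force;
        virial[5] += dely*delz*force;
      }
    }

Note also that the store_answers* calls move outside the "if (ii" guard in each kernel, so every work-item reaches the final reduction rather than only those assigned an atom.
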
COLLMF.device->world_barrier(); @@ -80,7 +80,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, - inum, nall, 300, maxspecial, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 3e29215c91..df9eeae667 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -125,20 +125,9 @@ double CoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulT::loop(const bool _eflag, const bool _vflag) { +int CoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Coul; diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index 03fc568c77..c4da81a3a2 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -46,22 +46,25 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[8]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -112,9 +115,9 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_fast(const __global numtyp4 *restrict x_, @@ -134,25 +137,28 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += 
dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul.h b/lib/gpu/lal_coul.h index 38472375fb..7298536dea 100644 --- a/lib/gpu/lal_coul.h +++ b/lib/gpu/lal_coul.h @@ -75,7 +75,7 @@ class Coul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 08ceb99300..1107708ca8 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -126,20 +126,9 @@ double CoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int CoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -162,6 +151,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulDebye; diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index e7f0b97e23..ba922f04a6 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -47,22 +47,25 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, @@ -140,6 +143,9 @@ 
__kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -213,8 +219,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_debye.h b/lib/gpu/lal_coul_debye.h index 13e4c5b0c6..9054df1995 100644 --- a/lib/gpu/lal_coul_debye.h +++ b/lib/gpu/lal_coul_debye.h @@ -76,7 +76,7 @@ class CoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index af54746def..516dca5df8 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -54,7 +54,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->world_barrier(); @@ -71,7 +71,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index fe1fbfede7..1a56e84b52 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -110,20 +110,9 @@ double CoulDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDSFT::loop(const bool _eflag, const bool _vflag) { +int CoulDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -131,8 +120,8 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -148,6 +137,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + 
return GX; } template class CoulDSF; diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index 190fb5b7fd..5241cb5097 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -48,30 +48,33 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -111,11 +114,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -126,9 +129,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, @@ -147,30 +150,33 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -210,11 +216,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -225,8 +231,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h index 3d57898f81..a33e98f836 100644 --- a/lib/gpu/lal_coul_dsf.h +++ b/lib/gpu/lal_coul_dsf.h @@ -70,7 +70,7 @@ class CoulDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_dsf_ext.cpp 
b/lib/gpu/lal_coul_dsf_ext.cpp index 2d18f9f94d..e21c70ae4b 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -55,7 +55,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -73,7 +73,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 9779526d62..370c186123 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -54,7 +54,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->world_barrier(); @@ -71,7 +71,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index 02097a2c61..36c1cd751f 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -116,20 +116,9 @@ double CoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulLongT::loop(const bool _eflag, const bool _vflag) { +int CoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -137,8 +126,8 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -153,6 +142,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulLong; diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 7adcdbbabc..f8a33e90a2 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -29,100 +29,6 @@ _texture( q_tex,int2); #define q_tex q_ #endif -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) 
\ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -140,22 +46,25 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -211,9 +120,11 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, @@ -233,24 +144,27 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { 
+ if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -306,8 +220,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 0668e0fd02..a89b8e447c 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -74,7 +74,7 @@ class CoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu index 85c9d84bdb..dfbc771adc 100644 --- a/lib/gpu/lal_coul_long_cs.cu +++ b/lib/gpu/lal_coul_long_cs.cu @@ -43,100 +43,6 @@ _texture( q_tex,int2); #define EPS_EWALD (acctyp)(1.0e-6) #define EPS_EWALD_SQR (acctyp)(1.0e-12) -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -154,22 +60,25 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp 
e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -245,9 +154,11 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -267,24 +178,27 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -360,8 +274,9 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_long_cs_ext.cpp b/lib/gpu/lal_coul_long_cs_ext.cpp index ae57eb2038..df92619f2f 100644 --- a/lib/gpu/lal_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_coul_long_cs_ext.cpp @@ -54,7 +54,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 653b4be4f3..1d9dcfdeca 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -54,7 +54,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, 
cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 911cdda383..a65c3d8810 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -18,12 +18,18 @@ #include #include #include -#ifdef _OPENMP +#if (LAL_USE_OMP == 1) #include #endif #if defined(USE_OPENCL) #include "device_cl.h" + +#ifdef LAL_OCL_EXTRA_ARGS +#define LAL_DM_STRINGIFY(x) #x +#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x) +#endif + #elif defined(USE_CUDART) const char *device=0; #else @@ -45,40 +51,44 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, - const int t_per_atom, const double cell_size, - char *ocl_vendor, const int block_pair) { - _nthreads=nthreads; - #ifdef _OPENMP - omp_set_num_threads(nthreads); - #endif +int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, + const double p_split, const int t_per_atom, + const double user_cell_size, char *ocl_args, + const int ocl_platform, char *device_type_flags, + const int block_pair) { _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; + _threads_per_three=t_per_atom; if (_device_init) return 0; _device_init=true; _comm_world=replica; //world; _comm_replica=replica; - _first_device=first_gpu; - _last_device=last_gpu; + int ndevices=ngpu; + _first_device=first_gpu_id; _gpu_mode=gpu_mode; _particle_split=p_split; - _cell_size=cell_size; + _user_cell_size=user_cell_size; _block_pair=block_pair; - // support selecting platform though "package device" keyword. - // "0:generic" will select platform 0 and tune for generic device - // "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu - if (ocl_vendor) { - char *sep = nullptr; - if ((sep = strstr(ocl_vendor,":"))) { - *sep = '\0'; - _platform_id = atoi(ocl_vendor); - ocl_vendor = sep+1; - } - } + + // support selecting OpenCL platform id with "package platform" keyword + if (ocl_platform >= 0) + _platform_id = ocl_platform; + + gpu=new UCL_Device(); + + // ---------------------- OpenCL Compiler Args ------------------------- + std::string extra_args=""; + if (ocl_args) extra_args+=":"+std::string(ocl_args); + #ifdef LAL_OCL_EXTRA_ARGS + extra_args+=":" LAL_PRE_STRINGIFY(LAL_OCL_EXTRA_ARGS); + #endif + for (int i=0; i procs_per_node) + ndevices = procs_per_node; + + // --------------------- OCL Platform Selection ----------------------- + + // Setup OpenCL platform and parameters based on platform + // and device type specifications + std::string ocl_vstring=""; + if (device_type_flags != nullptr) ocl_vstring=device_type_flags; + + // Setup the OpenCL platform + // If multiple platforms and no user platform specified, + // try to match platform from config matching any user specified + // device type. Give preference to platforms with GPUs. + // Priority under these conditions to platform with device with + // highest compute unit count. 
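
The comment block above states the policy when no platform id is given: match any user-specified device type, prefer platforms exposing GPUs, and break ties by the largest compute-unit count. A simplified sketch of that priority order (a hypothetical free function; the real selection happens inside gpu->auto_set_platform with the vendor matching shown below):

    #include <vector>

    struct PlatformInfo { bool has_gpu; int max_cus; };

    // Pick the index of the preferred OpenCL platform: GPU platforms
    // first, then the one with the most compute units.
    int pick_platform(const std::vector<PlatformInfo> &p) {
      int best = 0;
      for (int i = 1; i < (int)p.size(); i++) {
        if (p[i].has_gpu != p[best].has_gpu) {
          if (p[i].has_gpu) best = i;            // GPU platforms take priority
        } else if (p[i].max_cus > p[best].max_cus) {
          best = i;                              // then compute-unit count
        }
      }
      return best;
    }
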
+ int pres; + enum UCL_DEVICE_TYPE type=UCL_GPU; + #ifndef USE_OPENCL + pres=gpu->set_platform(0); + #else + if (_platform_id>=0) + pres=gpu->set_platform(_platform_id); + else { + std::string vendor=""; + if (device_type_flags!=nullptr) { + if (ocl_vstring=="intelgpu") + vendor="intel"; + else if (ocl_vstring=="intelcpu") { + vendor="intel"; + type=UCL_CPU; + } else if (ocl_vstring=="nvidiagpu") + vendor="nvidia"; + else if (ocl_vstring=="amdgpu") + vendor="amd"; + else if (ocl_vstring=="applegpu") + vendor="apple"; + } + pres=gpu->auto_set_platform(type,vendor,ndevices,_first_device); + } + #endif + if (pres != UCL_SUCCESS) + return -12; + + // ------------------------ Device Selection --------------------------- + if (_first_device > -1 && _first_device >= gpu->num_devices()) + return -2; + if (ndevices > gpu->num_devices()) + return -2; + if (_first_device + ndevices > gpu->num_devices()) + return -2; + if (gpu->num_devices()==0) + return -2; + + // Fully specified deviceIDs + if (_first_device > -1 && ndevices > 0) + _last_device = _first_device + ndevices - 1; + + // Find deviceID with most CUs (priority given to the accelerator type) + if (_first_device < 0) { + int best_device = 0; + int best_cus = gpu->cus(0); + bool type_match = (gpu->device_type(0) == type); + for (int i = 1; i < gpu->num_devices(); i++) { + if (type_match==true && gpu->device_type(i)!=type) + continue; + if (type_match == false && gpu->device_type(i) == type) { + type_match = true; + best_cus = gpu->cus(i); + best_device = i; + } + if (gpu->cus(i) > best_cus) { + best_cus = gpu->cus(i); + best_device = i; + } + } + _first_device = _last_device = best_device; + type = gpu->device_type(_first_device); + + if (ndevices > 0) { + // Expand range to meet specified number of devices + while (_last_device - _first_device < ndevices - 1) { + if (_last_device + 1 == gpu->num_devices()) + _first_device--; + else if (_first_device == 0) + _last_device++; + else { + if (gpu->device_type(_last_device+1)==type && + gpu->device_type(_first_device-1)!=type) + _last_device++; + else if (gpu->device_type(_last_device+1)!=type && + gpu->device_type(_first_device-1)==type) + _first_device--; + else if (gpu->cus(_last_device+1) > gpu->cus(_first_device-1)) + _last_device++; + else + _first_device--; + } + } + } + } + + // If ngpus not specified, expand range to include matching devices + if (ndevices == 0) { + for (int i = _first_device; i < gpu->num_devices(); i++) { + if (gpu->device_type(i)==gpu->device_type(_first_device) && + gpu->cus(i)==gpu->cus(_first_device)) + _last_device = i; + else + break; + } + ndevices = _last_device - _first_device + 1; + if (ndevices > procs_per_node) { + ndevices = procs_per_node; + _last_device=_first_device + ndevices - 1; + } + } + + // ------------------------ MPI Device ID Setup ----------------------- + // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ - (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu+first_gpu; + ndevices)); + int my_gpu=node_rank/_procs_per_gpu+_first_device; // Time on the device only if 1 proc per gpu _time_device=true; @@ -146,27 +278,51 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); - gpu=new UCL_Device(); - if (my_gpu>=gpu->num_devices()) - return -2; - - #ifndef CUDA_PROXY + #if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT) if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; 
#endif - if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS) - return -12; + // --------------- Device Configuration and Setup ------------------------- if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; - gpu->push_command_queue(); - gpu->set_command_queue(1); + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (gpu->arch()<7.0) { + gpu->push_command_queue(); + gpu->set_command_queue(1); + } + #endif _long_range_precompute=0; - if (set_ocl_params(ocl_vendor)!=0) + // If OpenCL parameters not specified by user, try to auto detect + // best option from the platform config + #ifdef USE_OPENCL + if (device_type_flags==nullptr) { + std::string pname = gpu->platform_name(); + for (int i=0; i='a') + pname[i]=toupper(pname[i]); + if (pname.find("NVIDIA")!=std::string::npos) + ocl_vstring="nvidiagpu"; + else if (pname.find("INTEL")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="intelgpu"; + else if (gpu->device_type()==UCL_CPU) + ocl_vstring="intelcpu"; + } else if (pname.find("AMD")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="amdgpu"; + } else if (pname.find("APPLE")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="applegpu"; + } + } + #endif + + if (set_ocl_params(ocl_vstring, extra_args)!=0) return -11; int flag=0; @@ -175,71 +331,90 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, flag=compile_kernels(); gpu_barrier(); } + + // Setup auto bin size calculation for calls from atom::sort + // - This is repeated in neighbor init with additional info + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,0,_simd_size); + #else + _neighbor_shared.setup_auto_cell_size(false,0,_simd_size); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size); + return flag; } template -int DeviceT::set_ocl_params(char *ocl_vendor) { +int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { #ifdef USE_OPENCL - std::string s_vendor=OCL_DEFAULT_VENDOR; - if (ocl_vendor!=nullptr) - s_vendor=ocl_vendor; - if (s_vendor=="none") - s_vendor="generic"; - if (s_vendor=="kepler") { - _ocl_vendor_name="NVIDIA Kepler"; - #if defined (__APPLE__) || defined(MACOSX) - _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; - #else - _ocl_vendor_string="-DKEPLER_OCL"; - #endif - } else if (s_vendor=="fermi") { - _ocl_vendor_name="NVIDIA Fermi"; - _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { - _ocl_vendor_name="AMD Cypress"; - _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { - _ocl_vendor_name="Intel Phi"; - _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { - _ocl_vendor_name="Intel CPU"; - _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { - _ocl_vendor_name="GENERIC"; - _ocl_vendor_string="-DGENERIC_OCL"; - } else { - _ocl_vendor_name="CUSTOM"; - _ocl_vendor_string="-DUSE_OPENCL"; - int token_count=0; - std::string params[13]; - char *pch = strtok(ocl_vendor,","); + #include "lal_pre_ocl_config.h" + + if (s_config=="" || s_config=="none") + s_config="generic"; + + int config_index=-1; + for (int i=0; ihas_subgroup_support()) + _ocl_compile_string+=" -DUSE_OPENCL_SUBGROUPS"; + #ifdef LAL_USE_OLD_NEIGHBOR + _ocl_compile_string+=" -DLAL_USE_OLD_NEIGHBOR"; + #endif + + _ocl_compile_string += " -DCONFIG_ID="+params[0]+ + " -DSIMD_SIZE="+params[1]+ + " -DMEM_THREADS="+params[2]; + if (gpu->has_shuffle_support()==false) + _ocl_compile_string+=" 
-DSHUFFLE_AVAIL=0"; + else + _ocl_compile_string+=" -DSHUFFLE_AVAIL="+params[3]; + _ocl_compile_string += " -DFAST_MATH="+params[4]+ + + " -DTHREADS_PER_ATOM="+params[5]+ + " -DTHREADS_PER_CHARGE="+params[6]+ + " -DTHREADS_PER_THREE="+params[7]+ + + " -DBLOCK_PAIR="+params[8]+ + " -DBLOCK_BIO_PAIR="+params[9]+ + " -DBLOCK_ELLIPSE="+params[10]+ + " -DPPPM_BLOCK_1D="+params[11]+ + " -DBLOCK_NBOR_BUILD="+params[12]+ + " -DBLOCK_CELL_2D="+params[13]+ + " -DBLOCK_CELL_ID="+params[14]+ + + " -DMAX_SHARED_TYPES="+params[15]+ + " -DMAX_BIO_SHARED_TYPES="+params[16]+ + " -DPPPM_MAX_SPLINE="+params[17]; + _ocl_compile_string += extra_args; #endif return 0; } @@ -269,8 +444,10 @@ int DeviceT::init(Answer &ans, const bool charge, else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) - if (gpu_nbor==1) - gpu_nbor=2; + if (gpu_nbor==1) gpu_nbor=2; + #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) gpu_nbor=2; #endif if (_init_count==0) { @@ -328,14 +505,15 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const int gpu_host, - const int max_nbors, const double cell_size, - const bool pre_cut, const int threads_per_atom) { + const int host_nlocal, const int nall, + const int maxspecial, const int gpu_host, + const int max_nbors, const double cutoff, + const bool pre_cut, const int threads_per_atom, + const bool ilist_map) { int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast(_particle_split*nlocal); - + int gpu_nbor=0; if (_gpu_mode==Device::GPU_NEIGH) gpu_nbor=1; @@ -345,16 +523,27 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, if (gpu_nbor==1) gpu_nbor=2; #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) + gpu_nbor=2; + #endif if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device, compile_string())) + _simd_size, _time_device, compile_string(), ilist_map)) return -3; - if (_cell_size<0.0) - nbor->cell_size(cell_size,cell_size); - else - nbor->cell_size(_cell_size,cell_size); + + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size()); + #else + _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size()); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size, + nbor->simd_size()); + nbor->set_cutoff(cutoff); return 0; } @@ -389,13 +578,21 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); - #ifdef _OPENMP - fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #if (LAL_USE_OMP == 1) + fprintf(screen,"- with %d thread(s) per proc.\n", omp_get_max_threads()); #endif #ifdef USE_OPENCL - fprintf(screen,"- with OpenCL Parameters for: %s\n", - _ocl_vendor_name.c_str()); + fprintf(screen,"- with OpenCL Parameters for: %s (%d)\n", + _ocl_config_name.c_str(),_config_id); #endif + if (shuffle_avail()) + fprintf(screen,"- Horizontal vector operations: ENABLED\n"); + else + fprintf(screen,"- Horizontal vector operations: DISABLED\n"); + if (gpu->shared_memory(first_gpu)) + fprintf(screen,"- Shared memory 
system: Yes\n"); + else + fprintf(screen,"- Shared memory system: No\n"); fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); @@ -431,7 +628,8 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=nullptr, *host_data_out=nullptr; - UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, *kernel_data=nullptr; + UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, + *kernel_data=nullptr; UCL_Timer *timers_in=nullptr, *timers_out=nullptr, *timers_kernel=nullptr; UCL_Timer over_timer(*gpu); @@ -472,7 +670,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, gpu_overhead=0.0; gpu_driver_overhead=0.0; - for (int i=0; i<10; i++) { + for (int z=0; z<11; z++) { gpu->sync(); gpu_barrier(); over_timer.start(); @@ -486,9 +684,11 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, timers_in[i].stop(); } + const int numel=1; for (int i=0; i0) { + gpu_overhead+=mpi_time; + gpu_driver_overhead+=mpi_driver_time; + } } gpu_overhead/=10.0; gpu_driver_overhead/=10.0; @@ -567,19 +773,22 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); @@ -587,13 +796,15 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } - if (nbor.gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); if (times[5]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); + fprintf(screen,"Vector width: %d.\n", simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor.gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); @@ -612,24 +823,29 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, const double max_bytes, const double cpu_time, const double idle_time, FILE *screen) { - double single[8], times[8]; + double single[9], times[9]; single[0]=time_out.total_seconds(); 
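  // Index map for the kspace timing vector filled in below (it grows from
  // 8 to 9 entries so the host-side data cast is reported separately from
  // the force transfer): 0 = data out, 1 = data in + atom cast, 2 = map
  // kernel, 3 = rho kernel, 4 = interp kernel, 5 = force transfer,
  // 6 = CPU Poisson time, 7 = CPU idle time, 8 = answer cast.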
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); single[4]=time_interp.total_seconds(); - single[5]=ans.transfer_time()+ans.cast_time(); + single[5]=ans.transfer_time(); single[6]=cpu_time; single[7]=idle_time; + single[8]=ans.cast_time(); - MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif + if (replica_me()==0) if (screen && times[6]>0.0) { @@ -639,7 +855,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); @@ -649,12 +865,13 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, (times[0]+times[2]+times[3])/_replica_size); fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); - fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Force copy: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Data Cast: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -699,14 +916,15 @@ int DeviceT::compile_kernels() { return flag; dev_program=new UCL_Program(*gpu); - int success=dev_program->load_string(device,compile_string().c_str()); + int success=dev_program->load_string(device,compile_string().c_str(), + nullptr,stderr); if (success!=UCL_SUCCESS) return -6; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_Vector gpu_lib_data(15,*gpu,UCL_NOT_PINNED); + UCL_Vector gpu_lib_data(19,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); @@ -717,39 +935,81 @@ int DeviceT::compile_kernels() { return -4; #endif - _num_mem_threads=gpu_lib_data[1]; - _warp_size=gpu_lib_data[2]; - if (_threads_per_atom<1) - _threads_per_atom=gpu_lib_data[3]; - if (_threads_per_charge<1) - _threads_per_charge=gpu_lib_data[13]; - _pppm_max_spline=gpu_lib_data[4]; - _pppm_block=gpu_lib_data[5]; - if (_block_pair == -1) _block_pair=gpu_lib_data[6]; - _max_shared_types=gpu_lib_data[7]; - _block_cell_2d=gpu_lib_data[8]; - _block_cell_id=gpu_lib_data[9]; - _block_nbor_build=gpu_lib_data[10]; - _block_bio_pair=gpu_lib_data[11]; - _max_bio_shared_types=gpu_lib_data[12]; - _block_ellipse=gpu_lib_data[14]; + _config_id=gpu_lib_data[1]; - if (static_cast(_block_pair)>gpu->group_size()) - _block_pair=gpu->group_size(); - if (static_cast(_block_bio_pair)>gpu->group_size()) - _block_bio_pair=gpu->group_size(); - if (_threads_per_atom>_warp_size) - 
-  if (static_cast<int>(_block_pair)>gpu->group_size())
-    _block_pair=gpu->group_size();
-  if (static_cast<int>(_block_bio_pair)>gpu->group_size())
-    _block_bio_pair=gpu->group_size();
-  if (_threads_per_atom>_warp_size)
-    _threads_per_atom=_warp_size;
-  if (_warp_size%_threads_per_atom!=0)
+  if (sizeof(numtyp)==sizeof(float))
+    _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp32_width());
+  else
+    _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp64_width());
+
+  _num_mem_threads=gpu_lib_data[3];
+  _shuffle_avail=gpu_lib_data[4];
+  _fast_math=gpu_lib_data[5];
+
+  if (_threads_per_atom<1)
+    _threads_per_atom=gpu_lib_data[6];
+  if (_threads_per_charge<1)
+    _threads_per_charge=gpu_lib_data[7];
+  if (_threads_per_three<1)
+    _threads_per_three=gpu_lib_data[8];
+
+  if (_block_pair == -1) {
+    _block_pair=gpu_lib_data[9];
+    _block_bio_pair=gpu_lib_data[10];
+    _block_ellipse=gpu_lib_data[11];
+  } else {
+    _block_bio_pair=_block_pair;
+    _block_ellipse=_block_pair;
+  }
+  _pppm_block=gpu_lib_data[12];
+  _block_nbor_build=gpu_lib_data[13];
+  _block_cell_2d=gpu_lib_data[14];
+  _block_cell_id=gpu_lib_data[15];
+
+  _max_shared_types=gpu_lib_data[16];
+  _max_bio_shared_types=gpu_lib_data[17];
+  _pppm_max_spline=gpu_lib_data[18];
+
+  if (static_cast<int>(_block_pair)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_bio_pair)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_ellipse)>gpu->group_size_dim(0) ||
+      static_cast<int>(_pppm_block)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_nbor_build)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_cell_2d)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_cell_2d)>gpu->group_size_dim(1) ||
+      static_cast<int>(_block_cell_id)>gpu->group_size_dim(0) ||
+      static_cast<int>(_max_shared_types*_max_shared_types*
+                       sizeof(numtyp)*17 > gpu->slm_size()) ||
+      static_cast<int>(_max_bio_shared_types*2*sizeof(numtyp) >
+                       gpu->slm_size()))
+    return -13;
+
+  if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 ||
+      _block_ellipse % _simd_size != 0 || _pppm_block % _simd_size != 0 ||
+      _block_nbor_build % _simd_size != 0 ||
+      _block_pair < _max_shared_types * _max_shared_types ||
+      _block_bio_pair * 2 < _max_bio_shared_types ||
+      _pppm_block < _pppm_max_spline * _pppm_max_spline)
+    return -11;
+
+  if (_threads_per_atom>_simd_size)
+    _threads_per_atom=_simd_size;
+  if (_simd_size%_threads_per_atom!=0)
    _threads_per_atom=1;
  if (_threads_per_atom & (_threads_per_atom - 1))
    _threads_per_atom=1;
-  if (_threads_per_charge>_warp_size)
-    _threads_per_charge=_warp_size;
-  if (_warp_size%_threads_per_charge!=0)
+  if (_threads_per_charge>_simd_size)
+    _threads_per_charge=_simd_size;
+  if (_simd_size%_threads_per_charge!=0)
    _threads_per_charge=1;
  if (_threads_per_charge & (_threads_per_charge - 1))
    _threads_per_charge=1;
+  if (_threads_per_three>_simd_size)
+    _threads_per_three=_simd_size;
+  if (_simd_size%_threads_per_three!=0)
+    _threads_per_three=1;
+  if (_threads_per_three & (_threads_per_three - 1))
+    _threads_per_three=1;

  return flag;
}
@@ -765,14 +1025,16 @@ Device<PRECISION,ACC_PRECISION> global_device;
}

using namespace LAMMPS_AL;
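The per-atom thread clamping above applies one rule to all three counts (pair, charge, three-body): the value may not exceed the SIMD width, must divide it evenly, and must be a power of two, with anything else falling back to 1. A standalone restatement of that rule; the function name is ours, not the library's, and it assumes both arguments are at least 1 as the callers guarantee:

    // Clamp a threads-per-atom request against the device SIMD width.
    inline int clamp_threads(int t, int simd_size) {
      if (t > simd_size) t = simd_size;  // no wider than one SIMD subgroup
      if (simd_size % t != 0) t = 1;     // must partition the subgroup evenly
      if (t & (t - 1)) t = 1;            // x & (x-1) == 0 only for powers of two
      return t;
    }
    // clamp_threads(8, 32) -> 8; clamp_threads(12, 32) -> 1;
    // clamp_threads(64, 32) -> 32
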
+{
+  return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode,
+                                   particle_split,t_per_atom,user_cell_size,
+                                   opencl_config,ocl_platform,
+                                   device_type_flags,block_pair);
}

void lmp_clear_device() {
@@ -780,8 +1042,16 @@ void lmp_clear_device() {
}

double lmp_gpu_forces(double **f, double **tor, double *eatom,
-                      double **vatom, double *virial, double &ecoul) {
-  return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
+                      double **vatom, double *virial, double &ecoul,
+                      int &error_flag) {
+  return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag);
+}
+
+double lmp_gpu_update_bin_size(const double subx, const double suby,
+                               const double subz, const int nlocal,
+                               const double cut) {
+  return global_device._neighbor_shared.update_cell_size(subx, suby,
+                                                         subz, nlocal, cut);
}

bool lmp_gpu_config(const std::string &category, const std::string &setting)
diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu
index afc7a0b988..61341964b2 100644
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@@ -26,20 +26,30 @@ __kernel void kernel_zero(__global int *restrict mem,
}

__kernel void kernel_info(__global int *info) {
-  info[0]=ARCH;
-  info[1]=MEM_THREADS;
-  info[2]=WARP_SIZE;
-  info[3]=THREADS_PER_ATOM;
-  info[4]=PPPM_MAX_SPLINE;
-  info[5]=PPPM_BLOCK_1D;
-  info[6]=BLOCK_PAIR;
-  info[7]=MAX_SHARED_TYPES;
-  info[8]=BLOCK_CELL_2D;
-  info[9]=BLOCK_CELL_ID;
-  info[10]=BLOCK_NBOR_BUILD;
-  info[11]=BLOCK_BIO_PAIR;
-  info[12]=MAX_BIO_SHARED_TYPES;
-  info[13]=THREADS_PER_CHARGE;
-  info[14]=BLOCK_ELLIPSE;
-}
+  #ifdef __CUDA_ARCH__
+  info[0]=__CUDA_ARCH__;
+  #else
+  info[0]=0;
+  #endif
+  info[1]=CONFIG_ID;
+  info[2]=SIMD_SIZE;
+  info[3]=MEM_THREADS;
+  info[4]=SHUFFLE_AVAIL;
+  info[5]=FAST_MATH;
+  info[6]=THREADS_PER_ATOM;
+  info[7]=THREADS_PER_CHARGE;
+  info[8]=THREADS_PER_THREE;
+
+  info[9]=BLOCK_PAIR;
+  info[10]=BLOCK_BIO_PAIR;
+  info[11]=BLOCK_ELLIPSE;
+  info[12]=PPPM_BLOCK_1D;
+  info[13]=BLOCK_NBOR_BUILD;
+  info[14]=BLOCK_CELL_2D;
+  info[15]=BLOCK_CELL_ID;
+
+  info[16]=MAX_SHARED_TYPES;
+  info[17]=MAX_BIO_SHARED_TYPES;
+  info[18]=PPPM_MAX_SPLINE;
+}
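The header change just below documents a -11 return from init_device() when config_string has the wrong number of parameters, and set_ocl_params() now receives the configuration as strings rather than the old vendor char buffer. A plausible shape for that kind of token-count check, sketched independently of the actual implementation (parse_config is a hypothetical helper):

    #include <sstream>
    #include <string>
    #include <vector>

    // Split a comma-separated tuning string and verify the token count;
    // mirrors the documented -11 "wrong number of parameters" failure mode.
    int parse_config(const std::string &config, int expected,
                     std::vector<std::string> &tokens) {
      tokens.clear();
      std::stringstream ss(config);
      std::string item;
      while (std::getline(ss, item, ',')) tokens.push_back(item);
      if (static_cast<int>(tokens.size()) != expected) return -11;
      return 0;
    }
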
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 21bd039c42..1db6ae3127 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -39,22 +39,23 @@ class Device {
  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
-   * the device (>=first_gpu and <=last_gpu) that this proc will be using
+   * the device (ngpu starting at first_gpu_id) that this proc will be using
   * Returns:
   * -  0 if successful
   * - -2 if GPU not found
   * - -4 if GPU library not compiled for GPU
   * - -6 if GPU could not be initialized for use
   * - -7 if accelerator sharing is not currently allowed on system
-   * - -11 if vendor_string has the wrong number of parameters **/
-  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
-                  const int last_gpu, const int gpu_mode,
-                  const double particle_split, const int nthreads,
-                  const int t_per_atom, const double cell_size,
-                  char *vendor_string, const int block_pair);
+   * - -11 if config_string has the wrong number of parameters **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
+                  const int first_gpu_id, const int gpu_mode,
+                  const double particle_split, const int t_per_atom,
+                  const double user_cell_size, char *config_string,
+                  const int ocl_platform, char *device_type_flags,
+                  const int block_pair);

  /// Initialize the device for Atom storage
-  /** \param charge True if charges need to be stored
+  /** \param charge True if charges need to be stored
   * \param rot True if quaternions need to be stored
   * \param nlocal Total number of local particles to allocate memory for
   * \param nall Total number of local+ghost particles
@@ -94,10 +95,11 @@ class Device {
   *        1 if gpu_nbor is true, and host needs a half nbor list,
   *        2 if gpu_nbor is true, and host needs a full nbor list
   * \param max_nbors Initial number of rows in the neighbor matrix
-   * \param cell_size cutoff+skin
+   * \param cutoff cutoff+skin
   * \param pre_cut True if cutoff test will be performed in separate kernel
   *                than the force kernel
   * \param threads_per_atom value to be used by the neighbor list only
+   * \param ilist_map true if ilist mapping data structures are used (3-body)
   *
   * Returns:
   * - 0 if successful
@@ -108,8 +110,9 @@ class Device {
  int init_nbor(Neighbor *nbor, const int nlocal,
                const int host_nlocal, const int nall,
                const int maxspecial, const int gpu_host,
-                const int max_nbors, const double cell_size,
-                const bool pre_cut, const int threads_per_atom);
+                const int max_nbors, const double cutoff,
+                const bool pre_cut, const int threads_per_atom,
+                const bool ilist_map = false);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
@@ -161,13 +164,16 @@ class Device {
  /// Add "answers" (force,energies,etc.) into LAMMPS structures
  inline double fix_gpu(double **f, double **tor, double *eatom,
-                        double **vatom, double *virial, double &ecoul) {
+                        double **vatom, double *virial, double &ecoul,
+                        int &error_flag) {
+    error_flag=0;
    atom.data_unavail();
    if (ans_queue.empty()==false) {
      stop_host_timer();
      double evdw=0.0;
      while (ans_queue.empty()==false) {
-        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,
+                                             error_flag);
        ans_queue.pop();
      }
      return evdw;
@@ -195,8 +201,6 @@ class Device {
  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
-  /// Return the number of threads per proc
-  inline int num_threads() const { return _nthreads; }
  /// My rank within all processes
  inline int world_me() const { return _world_me; }
  /// Total number of processes
@@ -228,45 +232,49 @@ class Device {
  /// True if device is being timed
  inline bool time_device() const { return _time_device; }

+  /// Accelerator device configuration id
+  inline int config_id() const { return _config_id; }
+  /// Number of threads executing concurrently on same multiproc
+  inline int simd_size() const { return _simd_size; }
  /// Return the number of threads accessing memory simultaneously
  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// 1 if horizontal vector operations enabled, 0 otherwise
+  inline int shuffle_avail() const { return _shuffle_avail; }
+  /// For OpenCL, 0 if fast-math options disabled, 1 if enabled
+  inline int fast_math() const { return _fast_math; }
+
  /// Return the number of threads per atom for pair styles
  inline int threads_per_atom() const { return _threads_per_atom; }
  /// Return the number of threads per atom for pair styles using charge
  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the number of threads per atom for 3-body pair styles
+  inline int threads_per_three() const { return _threads_per_three; }
+
  /// Return the min of the pair block size or the device max block size
  inline int pair_block_size() const { return _block_pair; }
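fix_gpu() above now zeroes an error_flag before draining the answer queue, so get_answers() can report problems without aborting inside the library; the caller checks the flag once all queued answers are summed. The essence of that pattern with stand-in types (Answers here is not the library's class, and the queue holds non-owning pointers):

    #include <queue>

    struct Answers { double evdwl; int error; };  // stand-in answer type

    double drain_answers(std::queue<Answers*> &q, int &error_flag) {
      error_flag = 0;
      double evdwl = 0.0;
      while (!q.empty()) {
        Answers *a = q.front();
        evdwl += a->evdwl;
        if (a->error) error_flag = a->error;  // record failure, keep draining
        q.pop();
      }
      return evdwl;  // caller inspects error_flag after accumulation
    }
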
-  /// Return the maximum number of atom types that can be used with shared mem
-  inline int max_shared_types() const { return _max_shared_types; }
-  /// Return the maximum order for PPPM splines
-  inline int pppm_max_spline() const { return _pppm_max_spline; }
-  /// Return the block size for PPPM kernels
-  inline int pppm_block() const { return _pppm_block; }
-  /// Return the block size for neighbor binning
-  inline int block_cell_2d() const { return _block_cell_2d; }
-  /// Return the block size for atom mapping for neighbor builds
-  inline int block_cell_id() const { return _block_cell_id; }
-  /// Return the block size for neighbor build kernel
-  inline int block_nbor_build() const { return _block_nbor_build; }
  /// Return the block size for "bio" pair styles
  inline int block_bio_pair() const { return _block_bio_pair; }
  /// Return the block size for "ellipse" pair styles
  inline int block_ellipse() const { return _block_ellipse; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+
  /// Architecture gpu code compiled for (returns 0 for OpenCL)
  inline double ptx_arch() const { return _ptx_arch; }

-  /// Number of threads executing concurrently on same multiproc
-  inline int warp_size() const { return _warp_size; }
-
-  // -------------------- SHARED DEVICE ROUTINES --------------------
-  // Perform asynchronous zero of integer array
-  void zero(UCL_D_Vec<int> &mem, const int numel) {
-    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
-                                         _block_pair));
-    k_zero.set_size(num_blocks,_block_pair);
-    k_zero.run(&mem,&numel);
-  }
+  inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; }

  // -------------------------- DEVICE DATA -------------------------
@@ -304,35 +312,7 @@ class Device {
  }

  inline std::string compile_string() { return _ocl_compile_string; }
-
-  private:
-  std::queue<Answer<numtyp,acctyp> *> ans_queue;
-  int _init_count;
-  bool _device_init, _host_timer_started, _time_device;
-  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
-  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
-      _replica_size;
-  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
-  double _particle_split;
-  double _cpu_full;
-  double _ptx_arch;
-  double _cell_size; // -1 if the cutoff is used
-
-  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
-  int _pppm_max_spline, _pppm_block;
-  int _block_pair, _block_ellipse, _max_shared_types;
-  int _block_cell_2d, _block_cell_id, _block_nbor_build;
-  int _block_bio_pair, _max_bio_shared_types;
-
-  UCL_Program *dev_program;
-  UCL_Kernel k_zero, k_info;
-  bool _compiled;
-  int compile_kernels();
-
-  int _data_in_estimate, _data_out_estimate;
-
-  std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
-  int set_ocl_params(char *);
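Both the zero() helper deleted above and the pair-style loop() bodies that follow size their launches the same way: BX work-items per block, t_per_atom of them cooperating on each atom, so one block covers BX/t_per_atom atoms and the grid is rounded up to cover all inum atoms. Restated as a small helper of our own, for illustration only:

    #include <cmath>

    // Number of blocks needed for inum atoms when each block of BX work-items
    // handles BX/t_per_atom atoms (t_per_atom is assumed to divide BX).
    inline int grid_size(int inum, int BX, int t_per_atom) {
      const int atoms_per_block = BX / t_per_atom;
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        atoms_per_block));
    }
    // e.g. grid_size(10000, 256, 4): 64 atoms per block -> 157 blocks
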
+  inline std::string ocl_config_name() { return _ocl_config_name; }

  template <class t>
  inline std::string toa(const t& in) {
@@ -342,6 +322,34 @@ class Device {
    return o.str();
  }

+  private:
+  std::queue<Answer<numtyp,acctyp> *> ans_queue;
+  int _init_count;
+  bool _device_init, _host_timer_started, _time_device;
+  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
+  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
+      _replica_size;
+  int _gpu_mode, _first_device, _last_device, _platform_id;
+  double _particle_split;
+  double _cpu_full;
+  double _ptx_arch;
+  double _user_cell_size; // -1 if the cutoff is used
+
+  int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math;
+  int _threads_per_atom, _threads_per_charge, _threads_per_three;
+  int _block_pair, _block_bio_pair, _block_ellipse;
+  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
+  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;
+
+  UCL_Program *dev_program;
+  UCL_Kernel k_zero, k_info;
+  bool _compiled;
+  int compile_kernels();
+
+  int _data_in_estimate, _data_out_estimate;
+
+  std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
+  int set_ocl_params(std::string, std::string);
};

}
diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp
index b0929e2ffb..ffdeb41ca8 100644
--- a/lib/gpu/lal_dipole_lj.cpp
+++ b/lib/gpu/lal_dipole_lj.cpp
@@ -125,20 +125,9 @@ double DipoleLJT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
+int DipoleLJT::loop(const int eflag, const int vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
@@ -146,8 +135,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv,
                          &eflag, &vflag,
@@ -165,6 +154,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
                    &_qqrd2e, &this->_threads_per_atom);
  }
  this->time_pair.stop();
+  return GX;
}

template class DipoleLJ<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu
index a3ed0d8d40..cbe68ff692 100644
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4);
#define mu_tex mu_
#endif

-#if (ARCH < 300)
+#if (SHUFFLE_AVAIL == 0)

-#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
-                         t_per_atom, offset, eflag, vflag, ans, engv) \
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \
+                         t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
-    __local acctyp red_acc[8][BLOCK_PAIR]; \
-    red_acc[0][tid]=f.x; \
-    red_acc[1][tid]=f.y; \
-    red_acc[2][tid]=f.z; \
-    red_acc[3][tid]=tor.x; \
-    red_acc[4][tid]=tor.y; \
-    red_acc[5][tid]=tor.z; \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
-      if (offset < s) { \
-        for (int r=0; r<6; r++) \
-          red_acc[r][tid] += red_acc[r][tid+s]; \
+
simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - 
engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -324,7 +396,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -335,9 +407,9 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, @@ -361,33 +433,33 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -537,7 +609,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -548,8 +620,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index bd312324c6..395a7472ba 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -77,7 +77,7 @@ class DipoleLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 0a94969c8b..90c9935913 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -57,7 +57,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, 
special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index dcf95bb126..6b40ffaa11 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -125,20 +125,9 @@ double DipoleLJSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { +int DipoleLJSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -165,6 +154,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLJSF; diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8032ae82ed..717d8959ba 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -32,106 +32,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - 
engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -357,7 +429,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -367,9 +439,9 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, @@ -394,33 
+466,33 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -600,7 +672,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -611,8 +683,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index ae73508065..088d8df03e 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -77,7 +77,7 @@ class DipoleLJSF : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 3626e8305e..0879702887 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -57,7 +57,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, special_lj, inum, nall, 300, + host_lj4, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_long_lj.cpp b/lib/gpu/lal_dipole_long_lj.cpp index 9648e9b15e..5531fa0dc9 100644 --- a/lib/gpu/lal_dipole_long_lj.cpp +++ b/lib/gpu/lal_dipole_long_lj.cpp @@ -128,20 +128,9 @@ double DipoleLongLJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { +int DipoleLongLJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - 
vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,8 +138,8 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -168,6 +157,7 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLongLJ; diff --git a/lib/gpu/lal_dipole_long_lj.cu b/lib/gpu/lal_dipole_long_lj.cu index 3aafba43aa..407b63f93e 100644 --- a/lib/gpu/lal_dipole_long_lj.cu +++ b/lib/gpu/lal_dipole_long_lj.cu @@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for 
(int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -368,7 +440,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -379,9 +451,9 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, @@ -406,26 +478,27 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); @@ -436,7 +509,6 @@ __kernel 
void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -622,7 +694,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -633,8 +705,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_long_lj.h b/lib/gpu/lal_dipole_long_lj.h index 77e22a10a7..c8f37efd2b 100644 --- a/lib/gpu/lal_dipole_long_lj.h +++ b/lib/gpu/lal_dipole_long_lj.h @@ -77,7 +77,7 @@ class DipoleLongLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_long_lj_ext.cpp b/lib/gpu/lal_dipole_long_lj_ext.cpp index b2751e8a82..fd61706ba9 100644 --- a/lib/gpu/lal_dipole_long_lj_ext.cpp +++ b/lib/gpu/lal_dipole_long_lj_ext.cpp @@ -58,7 +58,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index c5cbc7eb53..f890fb53a3 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -52,15 +52,31 @@ int DPDT::init(const int ntypes, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,_screen,dpd,"k_dpd"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,dpd,"k_dpd",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -117,20 +133,9 @@ double DPDT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DPDT::loop(const bool _eflag, const bool _vflag) { +int 
DPDT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,8 +143,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, @@ -155,6 +160,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index a29e04fc7f..2794110a92 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -179,16 +179,19 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; energy+=factor_dpd*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -267,9 +270,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, @@ -289,6 +292,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,25 +300,36 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { @@ -359,24 +382,37 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // drag force = -gamma * wd^2 * (delx dot delv) / r // random force = sigma * wd * rnd * dtinvsqrt; + #ifndef ONETYPE + const numtyp coeffx=coeff[mtype].x; + const numtyp coeffy=coeff[mtype].y; + const numtyp coeffz=coeff[mtype].z; + #endif numtyp force = (numtyp)0.0; - if (!tstat_only) force = coeff[mtype].x*wd; - force -= coeff[mtype].y*wd*wd*dot*rinv; - force += coeff[mtype].z*wd*randnum*dtinvsqrt; + if (!tstat_only) force = coeffx*wd; + force -= coeffy*wd*wd*dot*rinv; + force += coeffz*wd*randnum*dtinvsqrt; + #ifndef ONETYPE force*=factor_dpd*rinv; + #else + force*=rinv; + #endif f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted 
to 0.0 at cutoff - numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; + numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd; + #ifndef ONETYPE energy+=factor_dpd*e; + #else + energy+=e; + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -387,8 +423,8 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 3c36c39e05..be93d988a3 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -78,7 +78,7 @@ class DPD : public BaseDPD { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index d727a87319..7637ff03c0 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -55,7 +55,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->world_barrier(); @@ -73,7 +73,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, } if (gpu_rank==i && world_me!=0) init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 03479cd16a..cdafe72898 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -52,9 +52,23 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + for (int i=1; i=0 && host_type2frho[i]<=nfrho-1) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+i; + } + if (onetype<0) onetype=0; + #endif + int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,eam,"k_eam"); + gpu_split,_screen,eam,"k_eam",onetype); if (success!=0) return success; @@ -72,6 +86,13 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); + + #if defined(LAL_OCL_EV_JIT) + k_energy_fast_noev.set_function(*(this->pair_program_noev),"k_energy_fast"); + #else + k_energy_sel = &k_energy_fast; + #endif + _compiled_energy = true; // Initialize timers for selected GPU @@ -88,7 +109,6 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -260,6 +280,9 @@ void EAMT::clear() { if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); + #if defined(LAL_OCL_EV_JIT) + 
k_energy_fast_noev.clear(); + #endif _compiled_energy=false; } @@ -278,11 +301,18 @@ template void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -346,12 +376,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, template int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -430,9 +468,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, loop2(eflag,vflag); if (ilist == nullptr) - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag,vflag,eatom,vatom, this->ans->inum()); else - this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); + this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist, this->ans->inum()); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -442,20 +480,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void EAMT::loop(const bool _eflag, const bool _vflag) { +int EAMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -464,13 +491,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { - this->k_energy_fast.set_size(GX,BX); - this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_fp, &this->ans->engv, &eflag, &ainum, - &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, - &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_energy_sel = &k_energy_fast; + else k_energy_sel = &k_energy_fast_noev; + #endif + + k_energy_sel->set_size(GX,BX); + k_energy_sel->run(&this->atom->x, &type2rhor_z2r, &type2frho, + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_fp, &this->ans->engv, &eflag, &ainum, + &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, + 
&_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, @@ -482,6 +514,7 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { } this->time_pair.stop(); + return ainum; } // --------------------------------------------------------------------------- @@ -510,8 +543,8 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { this->time_pair2.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index b22ce7b575..3955f3cc8a 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -36,6 +36,16 @@ _texture( z2r_sp1_tex,int4); _texture( z2r_sp2_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define fp_tex fp_ +#define rhor_sp1_tex rhor_spline1 +#define rhor_sp2_tex rhor_spline2 +#define frho_sp1_tex frho_spline1 +#define frho_sp2_tex frho_spline2 +#define z2r_sp1_tex z2r_spline1 +#define z2r_sp2_tex z2r_spline2 +#endif + #else #define pos_tex x_ @@ -52,30 +62,33 @@ _texture( z2r_sp2_tex,int4); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_energy_fp() \ + __local acctyp red_acc[BLOCK_PAIR]; #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=rho; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) \ red_acc[tid] += red_acc[tid+s]; \ } \ rho=red_acc[tid]; \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ @@ -83,15 +96,18 @@ _texture( z2r_sp2_tex,int4); } \ } +#define local_allocate_store_answers_eam() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -101,10 +117,12 @@ _texture( z2r_sp2_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ + simdsync(); \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -114,13 +132,13 @@ _texture( z2r_sp2_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -131,53 +149,57 @@ _texture( z2r_sp2_tex,int4); #else +#define 
local_allocate_store_energy_fp() + #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) \ - rho += shfl_xor(rho, s, t_per_atom); \ + rho += shfl_down(rho, s, t_per_atom); \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=energy; \ + engv[ii]=energy; \ } \ } +#define local_allocate_store_answers_eam() + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -203,21 +225,23 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { - int tid, ii, offset; + int tid, ii, offset, i, itype; atom_info(t_per_atom,ii,tid,offset); + int n_stride; + local_allocate_store_energy_fp(); + acctyp rho = (acctyp)0; - acctyp energy = (acctyp)0; + acctyp energy; + if (EVFLAG && eflag) energy=(acctyp)0; if (ii0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -431,10 +469,9 @@ __kernel void k_eam(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_eam_fast(const __global numtyp4 *x_, @@ -453,40 +490,51 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (tid0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -547,8 +610,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index fa05075883..3cbaeac0b8 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -90,7 +90,7 @@ class 
EAM : public BaseAtomic { const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_energy, k_energy_fast; + UCL_Kernel k_energy, k_energy_fast, k_energy_fast_noev, *k_energy_sel; // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; @@ -133,8 +133,8 @@ class EAM : public BaseAtomic { protected: bool _allocated; int _nlocal; - void loop(const bool _eflag, const bool _vflag); - void loop2(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); + void loop2(const bool eflag, const bool vflag); }; } diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index e5f1010e76..f7c4986e68 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -67,7 +67,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 78f2e3c1f8..3010e0ea7f 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -67,7 +67,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); @@ -98,7 +98,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, fprintf(screen,"\n"); if (init_ok==0) - EAMMF.estimate_gpu_overhead(); + EAMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 37208e54f8..205b601562 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -67,7 +67,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_fs_gpu_init(const int ntypes, double 
host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index e6122c7404..1c549ab6a6 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -32,22 +32,21 @@ _texture_2d( quat_tex,int4); #define quat_tex qif #endif -#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - nbor_begin+=nbor_stride; \ - nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ - nbor_begin+=fast_mul(offset,nbor_stride); \ - stride=fast_mul(t_per_atom,nbor_stride); +#define nbor_info_e_ss(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride; \ + nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ + nbor_begin+=fast_mul(offset,nbor_stride); \ + stride=fast_mul(t_per_atom,nbor_stride); -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -55,6 +54,7 @@ _texture_2d( quat_tex,int4); red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -66,28 +66,39 @@ _texture_2d( quat_tex,int4); tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -100,12 +111,12 @@ _texture_2d( quat_tex,int4); #define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; 
\ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -115,10 +126,11 @@ _texture_2d( quat_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -128,13 +140,13 @@ _texture_2d( quat_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ @@ -150,31 +162,31 @@ _texture_2d( quat_tex,int4); #else #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -188,25 +200,25 @@ _texture_2d( quat_tex,int4); eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 5ad935ba9b..9b9d03914c 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -34,7 
+34,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, __global int *dev_nbor, const int nbor_pitch, const int start, const int inum, const __global int *dev_ij, - const int form_low, const int form_high) { + const int form_low, const int form_high, + const int t_per_atom) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -45,12 +46,15 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, int numj=dev_ij[nbor]; nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); - int packed=ii+nbor_pitch+nbor_pitch; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); int newj=0; + + __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom; + const int out_stride=nbor_pitch*t_per_atom-t_per_atom; + for ( ; nbor -void GaussT::loop(const bool _eflag, const bool _vflag) { +int GaussT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -143,19 +132,20 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &gauss1, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Gauss; diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 2192fb39ca..2540b8492f 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -27,7 +27,6 @@ _texture_2d( pos_tex,int4); __kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, const int lj_types, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -37,23 +36,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - __local numtyp sp_lj[4]; - sp_lj[0]=sp_lj_in[0]; - sp_lj[1]=sp_lj_in[1]; - sp_lj[2]=sp_lj_in[2]; - sp_lj[3]=sp_lj_in[3]; + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -101,14 +97,13 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } } // for nbor - 
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -119,26 +114,26 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[4]; - if (tid<4) - sp_lj[tid]=sp_lj_in[tid]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -186,8 +181,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1399b82d03..ecb04c49b2 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -73,7 +73,7 @@ class Gauss : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index a2804ce3cf..afec2e86f2 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -55,7 +55,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index f17fc50f5f..2b1a190e5a 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -127,7 +127,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -136,7 +136,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + view4.view(host_write,well.numel()); ucl_copy(well,view4,false); _allocated=true; @@ -184,19 +184,8 @@ double GayBerneT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void GayBerneT::loop(const bool _eflag, const bool _vflag) { +int GayBerneT::loop(const int eflag, 
const int vflag) { const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -213,8 +202,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, @@ -230,7 +219,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.stop(); this->time_lj.start(); this->time_lj.stop(); - return; + return ainum; } // ------------ SPHERE_ELLIPSE --------------- @@ -246,8 +235,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor2.stop(); this->time_ellipsoid2.start(); - this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_sphere_elps_sel->set_size(GX,BX); + this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, @@ -276,8 +265,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { - this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj_sel->set_size(GX,BX); + this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -303,8 +292,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, @@ -312,6 +301,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } + return ainum; } template class GayBerne; diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index c9d0353ca8..9267dfd85d 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -100,29 +100,27 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; 
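// --- Sketch (not part of the patch): the EVFLAG guard pattern ---------------
// The k_energy_fast_noev kernel and the LAL_OCL_EV_JIT selection earlier in
// this patch suggest EVFLAG is a compile-time 0/1 constant baked into each
// kernel variant rather than a runtime argument; under that assumption the
// recurring rewrite of "if (eflag>0)" into "if (EVFLAG && eflag)" is free:
//
//   acctyp energy, virial[6];            // deliberately left uninitialized
//   if (EVFLAG) {                        // dead code when EVFLAG == 0
//     energy = (acctyp)0;
//     for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
//   }
//   /* ... pair loop ... */
//   if (EVFLAG && eflag) energy += u_r*temp2;   // stripped from noev builds
//
// One kernel source thus serves both variants, with all energy/virial
// bookkeeping eliminated by the compiler when EVFLAG is 0.
// -----------------------------------------------------------------------------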
numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; @@ -356,8 +354,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index 750c739cec..5cdc6bcd67 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -86,7 +86,7 @@ class GayBerne : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index fdf40720aa..4582f0d411 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -17,6 +17,13 @@ #include "lal_ellipsoid_extra.h" #endif +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, @@ -38,25 +45,26 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; @@ -239,9 +247,9 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, f.z+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, @@ -261,26 +269,27 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -327,9 +336,9 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } 
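// --- Sketch (not part of the patch): why shfl_xor became shfl_down ----------
// Assuming shfl_down(v,s,w) wraps the warp/sub-group shuffle primitives from
// the GPU package preprocessor headers, the reduction over the t_per_atom
// threads sharing one atom now accumulates toward lane 0 only:
//
//   for (unsigned int s = t_per_atom/2; s > 0; s >>= 1) {
//     f.x += shfl_down(f.x, s, t_per_atom);
//     f.y += shfl_down(f.y, s, t_per_atom);
//     f.z += shfl_down(f.z, s, t_per_atom);
//     if (EVFLAG) energy += shfl_down(energy, s, t_per_atom);
//   }
//   if (offset == 0 && ii < inum) { /* lane 0 holds the totals; store them */ }
//
// shfl_xor produced the sum in every lane (a butterfly), which is wasted work
// when only offset==0 writes. Note the store macros also moved outside the
// "if (ii<inum)" block in these hunks -- every lane must reach the shuffle
// and simdsync() calls to avoid divergence -- so the final write is guarded
// with "offset==0 && ii<inum" instead. In the SHUFFLE_AVAIL==0 fallback the
// same loop reads red_acc[tid+s] from __local memory, hence the simdsync()
// added before each step for hardware whose sub-groups are not lockstep.
// -----------------------------------------------------------------------------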
// if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, @@ -351,31 +360,32 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid+3]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -421,8 +431,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 5bd015e364..40fefe28b3 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -51,16 +51,31 @@ int LJT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj,"k_lj"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,lj,"k_lj",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -130,20 +145,9 @@ double LJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJT::loop(const bool _eflag, const bool _vflag) { +int LJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +155,8 @@ void LJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +169,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ; diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 7297a287e6..382cd140d9 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -38,16 +38,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -96,9 +99,9 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_fast(const __global numtyp4 *restrict x_, @@ -114,6 +117,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -121,38 +125,58 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } + __syncthreads(); + #else + const numtyp lj1x=lj1_in[ONETYPE].x; + const numtyp lj1y=lj1_in[ONETYPE].y; + const numtyp cutsq=lj1_in[ONETYPE].z; + numtyp lj3x, lj3y, lj3z; + if (EVFLAG && eflag) { + lj3x=lj3_in[ONETYPE].x; + lj3y=lj3_in[ONETYPE].y; + lj3z=lj3_in[ONETYPE].z; + } + #endif + + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { - numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + if (EVFLAG && eflag) { + #ifndef ONETYPE + numtyp lj3x=lj3[mtype].x; + numtyp lj3y=lj3[mtype].y; + numtyp lj3z=lj3[mtype].z; + #endif + numtyp e=r6inv*(lj3x*r6inv-lj3y); + #ifndef ONETYPE + energy+=factor_lj*(e-lj3z); + #else + energy+=(e-lj3z); + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -182,10 +223,9 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index c6fec0d159..cdf850efd7 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -76,7 +76,7 @@ class LJ : public BaseAtomic { private: bool _allocated; - void 
loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 6f74cd0f19..df7dc11558 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -113,20 +113,9 @@ double LJ96T::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJ96T::loop(const bool _eflag, const bool _vflag) { +int LJ96T::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ96; diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index c602e7555e..d1f7e3791f 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -39,22 +39,25 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, @@ -125,27 +128,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); 
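// --- Reconstruction note -----------------------------------------------------
// The repeated "static_cast(ceil(static_cast(" fragments in the loop() bodies
// above lost their template arguments in transit; the intended grid-size
// computation is presumably the usual GPU-package idiom:
//
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//
// i.e. one thread block covers BX/t_per_atom atoms, rounded up -- and GX is
// now returned from loop() so callers know the launch width actually used.
// -----------------------------------------------------------------------------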
energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -195,8 +201,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index eef6863f37..535e32a580 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -71,7 +71,7 @@ class LJ96 : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index f68b35de57..be7ffc5a09 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -55,7 +55,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); @@ -73,7 +73,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index 24b07212ed..31e03a2a82 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -123,20 +123,9 @@ double LJClass2LongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { +int LJClass2LongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -144,8 +133,8 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJClass2Long; diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index 65f0bf993c..5c8a2d46b2 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + 
local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -131,7 +134,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -142,9 +145,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, @@ -168,28 +171,31 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -253,7 +259,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -264,8 +270,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h index eac6451b2e..84e07bf7cd 100644 --- a/lib/gpu/lal_lj_class2_long.h +++ b/lib/gpu/lal_lj_class2_long.h @@ -75,7 +75,7 @@ class LJClass2Long : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index f669a81189..311b027536 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -58,7 +58,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, 
maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp index 59ce9c5e61..cd8a411a79 100644 --- a/lib/gpu/lal_lj_coul.cpp +++ b/lib/gpu/lal_lj_coul.cpp @@ -125,20 +125,9 @@ double LJCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulT::loop(const bool _eflag, const bool _vflag) { +int LJCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoul; diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index afbb972942..c728967bc5 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -133,9 +136,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_fast(const __global numtyp4 
*restrict x_, @@ -158,29 +161,32 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -246,8 +252,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h index 0e11162aa5..eb490d5820 100644 --- a/lib/gpu/lal_lj_coul.h +++ b/lib/gpu/lal_lj_coul.h @@ -77,7 +77,7 @@ class LJCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp index 556a0a5cd3..78ef1bf3f7 100644 --- a/lib/gpu/lal_lj_coul_debye.cpp +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -127,20 +127,9 @@ double LJCoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int LJCoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -148,8 +137,8 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -163,6 +152,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulDebye; diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 053fbeccc8..1804625649 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -48,6 +48,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); 
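// --- Sketch (assumption): what local_allocate_store_charge() expands to -----
// By analogy with local_allocate_store_energy_fp() and
// local_allocate_store_answers_eam() defined earlier in this patch, the
// charge-kernel macro presumably hoists the __local reduction scratch out of
// the store_answers_q macro to kernel scope; the dimension here is a guess
// (3 force slots + energy + e_coul, reused for the 6 virial terms):
//
//   #if (SHUFFLE_AVAIL == 0)
//   #define local_allocate_store_charge()                                    \
//     __local acctyp red_acc[6][BLOCK_PAIR];
//   #else
//   #define local_allocate_store_charge()
//   #endif
//
// Declaring the buffer once per kernel, instead of inside the old
// "if (t_per_atom>1)" branch, avoids scoping problems with __local variables
// declared in nested blocks under some OpenCL compilers.
// -----------------------------------------------------------------------------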
+ sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -129,7 +132,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, @@ -166,29 +169,32 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -249,7 +255,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -260,8 +266,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h index 22fcf7234b..19abf32169 100644 --- a/lib/gpu/lal_lj_coul_debye.h +++ b/lib/gpu/lal_lj_coul_debye.h @@ -77,7 +77,7 @@ class LJCoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 95588eb95a..4f81b01457 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -58,7 +58,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, 
offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); @@ -77,7 +77,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index 060088a7cb..5b7f97e630 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -57,7 +57,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp index 66897a4aa7..e6be361abb 100644 --- a/lib/gpu/lal_lj_coul_long.cpp +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -140,20 +140,9 @@ double LJCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulLong; diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index ac3479421f..85af3c3433 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, 
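// --- Note on the recurring "300 -> max_nbors" hunks ---------------------------
// Every *_ext.cpp init wrapper in this patch makes the same substitution: the
// hard-coded 300 neighbor slots per atom become the max_nbors estimate LAMMPS
// derives from the cutoff and density, e.g. for lj/cut/coul/long:
//
//   init_ok = LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
//                         host_lj4, offset, special_lj, inum, nall,
//                         max_nbors,   // was the literal 300
//                         maxspecial, cell_size, gpu_split, screen,
//                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
//                         qqrd2e, g_ewald);
//
// Nothing else in the calls changes; dense systems no longer overflow the
// fixed allocation and sparse ones stop over-allocating.
// ------------------------------------------------------------------------------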
sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -129,7 +132,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, @@ -164,28 +167,31 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -247,7 +253,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -258,8 +264,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h index 8f77671dc0..bc4fce40a5 100644 --- a/lib/gpu/lal_lj_coul_long.h +++ b/lib/gpu/lal_lj_coul_long.h @@ -80,7 +80,7 @@ class LJCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index 33771af53c..6a027bdc7e 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljcl_gpu_init(const int ntypes, 
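Same one-line fix in every *_ext.cpp init call of this patch: the hard-coded 300 that capped the initial per-atom neighbor allocation is replaced by the max_nbors value the caller computed. Condensed sketch with a hypothetical signature (the real init() takes the full parameter list shown above):

    // before: pair_init_sketch(inum, nall, /* max_nbors = */ 300);
    // after:  pair_init_sketch(inum, nall, max_nbors);
    static int pair_init_sketch(const int inum, const int nall,
                                const int max_nbors) {
      return (inum >= 0 && nall >= inum && max_nbors > 0) ? 0 : -1;
    }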
double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp index 9a17d068ec..656736865b 100644 --- a/lib/gpu/lal_lj_coul_msm.cpp +++ b/lib/gpu/lal_lj_coul_msm.cpp @@ -157,20 +157,9 @@ double LJCoulMSMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { +int LJCoulMSMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -178,8 +167,8 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -195,6 +184,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_order, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulMSM; diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index a3c36eed85..39fc723736 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -28,6 +28,11 @@ _texture( gcons_tex,int2); _texture( dgcons_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define gcons_tex gcons +#define dgcons_tex dgcons +#endif + #else #define pos_tex x_ #define q_tex q_ @@ -100,6 +105,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -109,18 +117,18 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -183,7 +191,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,9 +202,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - 
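The new __CUDACC_VER_MAJOR__ guard in lal_lj_coul_msm.cu exists because the legacy texture-reference API is deprecated as of CUDA 11, so on nvcc 11 and later the *_tex names are aliased to the plain device pointers and the fetch macros fall through to ordinary global-memory loads. The guard in isolation:

    #if (__CUDACC_VER_MAJOR__ >= 11)
    #define gcons_tex gcons
    #define dgcons_tex dgcons
    #endif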
vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, @@ -220,28 +228,31 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -304,7 +315,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -315,8 +326,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_msm.h b/lib/gpu/lal_lj_coul_msm.h index 6369ce8cb5..a929848aaf 100644 --- a/lib/gpu/lal_lj_coul_msm.h +++ b/lib/gpu/lal_lj_coul_msm.h @@ -80,7 +80,7 @@ class LJCoulMSM : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index d957cbe376..2d9d77fe77 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -59,7 +59,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); @@ -79,7 +79,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp index f8200ec037..fa5073d409 100644 --- a/lib/gpu/lal_lj_cubic.cpp +++ b/lib/gpu/lal_lj_cubic.cpp @@ -119,20 +119,9 @@ double LJCubicT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCubicT::loop(const bool _eflag, const bool _vflag) { +int LJCubicT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int 
eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -140,8 +129,8 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -154,6 +143,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCubic; diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index f93013fe75..a91326d521 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -46,16 +46,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -106,7 +109,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -117,9 +120,9 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, @@ -140,27 +143,30 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -211,7 +217,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -222,8 +228,8 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + 
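Alongside the flag rework, every loop() now launches through the k_pair_sel pointer instead of calling k_pair_fast directly; judging by the LAL_OCL_EV_JIT hunks later in this patch, set_kernel() aims that pointer at either the EV-enabled or the no-EV compiled variant. A self-contained host-side analogue of the mechanism (names are illustrative, not the library's):

    #include <cstdio>

    static void pair_kernel_ev()   { std::puts("energy/virial variant"); }
    static void pair_kernel_noev() { std::puts("force-only variant"); }

    static void (*k_pair_sel_sketch)(void) = &pair_kernel_ev;

    static void set_kernel_sketch(const int eflag, const int vflag) {
      k_pair_sel_sketch = (eflag || vflag) ? &pair_kernel_ev
                                           : &pair_kernel_noev;
    }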
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h index 9578ca27e4..a37044b279 100644 --- a/lib/gpu/lal_lj_cubic.h +++ b/lib/gpu/lal_lj_cubic.h @@ -73,7 +73,7 @@ class LJCubic : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp index f02ce0f184..2f8ebac37b 100644 --- a/lib/gpu/lal_lj_cubic_ext.cpp +++ b/lib/gpu/lal_lj_cubic_ext.cpp @@ -58,7 +58,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (world_me==0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->world_barrier(); @@ -77,7 +77,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (gpu_rank==i && world_me!=0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp index b888f33f00..d41aa13deb 100644 --- a/lib/gpu/lal_lj_dsf.cpp +++ b/lib/gpu/lal_lj_dsf.cpp @@ -125,20 +125,9 @@ double LJDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJDSFT::loop(const bool _eflag, const bool _vflag) { +int LJDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJDSF; diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index c1bb197148..5beedb0bbb 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -50,6 +50,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -59,18 +62,18 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, 
virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -130,7 +133,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -140,7 +143,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -151,9 +154,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, @@ -176,28 +179,31 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -257,7 +263,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -267,7 +273,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -278,8 +284,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h index b176e087db..b303285e9c 100644 --- a/lib/gpu/lal_lj_dsf.h +++ b/lib/gpu/lal_lj_dsf.h @@ -77,7 +77,7 @@ class LJDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 6d53896a11..e70059261c 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -59,7 +59,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJDMF.init(ntypes, cutsq, 
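Also uniform across these kernels: the closing '} // if ii' brace moves above the store call, so store_answers*() is now reached by every thread in the block, not only those with ii < inum. The likely reason (an inference, not stated in the patch) is that with the new n_stride / local_allocate_store_* preamble the store helpers stride through local memory and synchronize, and a divergent barrier would hang. Skeleton of the new control flow:

    __device__ void kernel_shape_sketch(const int ii, const int inum) {
      if (ii < inum) {
        // neighbor loop: accumulate f, energy, virial ...
      }                        // guard now closes *before* the store
      // store_answers_q(...); // unconditional; may barrier internally
    }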
host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -79,7 +79,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 1c58cecfae..3d9e526d0c 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -133,20 +133,9 @@ double LJExpandT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandT::loop(const bool _eflag, const bool _vflag) { +int LJExpandT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,8 +143,8 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -168,6 +157,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpand; diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 46ed9e2a31..2eff2cd89b 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -41,22 +41,25 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, @@ -129,27 +132,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, 
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(numtyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -201,8 +207,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 2560d166c7..94448a871d 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -76,7 +76,7 @@ class LJExpand : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long.cpp b/lib/gpu/lal_lj_expand_coul_long.cpp index 3e5e00ef6a..41c2ff6229 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cpp +++ b/lib/gpu/lal_lj_expand_coul_long.cpp @@ -140,20 +140,9 @@ double LJExpandCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJExpandCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpandCoulLong; diff --git a/lib/gpu/lal_lj_expand_coul_long.cu b/lib/gpu/lal_lj_expand_coul_long.cu index 0f0fe4c2fb..abb3d5ca3f 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cu +++ b/lib/gpu/lal_lj_expand_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_expand_coul_long(const 
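Buried in the k_lj_expand_fast hunk above is a small type fix riding along with the EVFLAG rework: the virial accumulators were zeroed with (numtyp)0 where every sibling kernel uses (acctyp)0. Harmless when the two types match, but a pointless conversion in mixed-precision builds. The corrected initialization in isolation (the precisions here are only a sketch):

    typedef float numtyp;     // position/parameter precision
    typedef double acctyp;    // accumulator precision

    __device__ void zero_virial_sketch(acctyp virial[6]) {
      for (int i = 0; i < 6; i++)
        virial[i] = (acctyp)0;   // was (numtyp)0 in k_lj_expand_fast
    }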
__global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -133,7 +136,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -254,7 +260,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -265,8 +271,8 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_expand_coul_long.h b/lib/gpu/lal_lj_expand_coul_long.h index 404a36e5bc..44f7aff3fe 100644 --- a/lib/gpu/lal_lj_expand_coul_long.h +++ b/lib/gpu/lal_lj_expand_coul_long.h @@ -80,7 +80,7 @@ class LJExpandCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long_ext.cpp b/lib/gpu/lal_lj_expand_coul_long_ext.cpp index 3ff1bef701..e5506dd7aa 100644 --- a/lib/gpu/lal_lj_expand_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_expand_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, 
max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 603e425d3f..02decf2712 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -56,7 +56,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, shift, special_lj, inum, nall, 300, + host_lj4, offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJEMF.device->world_barrier(); @@ -74,7 +74,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 124cf46c8c..fa00fc4f64 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ -55,7 +55,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_gromacs.cpp b/lib/gpu/lal_lj_gromacs.cpp index 0563151ddd..8a385ece6b 100644 --- a/lib/gpu/lal_lj_gromacs.cpp +++ b/lib/gpu/lal_lj_gromacs.cpp @@ -121,20 +121,9 @@ double LJGROMACST::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJGROMACST::loop(const bool _eflag, const bool _vflag) { +int LJGROMACST::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, @@ -159,6 +148,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJGROMACS; diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index 21381bef30..4117cc1440 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -42,21 +42,24 @@ __kernel 
void k_lj_gromacs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -108,7 +111,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -119,9 +122,9 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, @@ -142,6 +145,9 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 ljsw[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -213,7 +219,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -224,8 +230,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index 3dec13c6d7..8fedaf07a1 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -76,7 +76,7 @@ class LJGROMACS : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index 99d32ab09a..19d1d12513 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -58,7 +58,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); @@ -77,7 +77,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, 
host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); diff --git a/lib/gpu/lal_lj_sdk.cpp b/lib/gpu/lal_lj_sdk.cpp index c6a282576c..0da094c953 100644 --- a/lib/gpu/lal_lj_sdk.cpp +++ b/lib/gpu/lal_lj_sdk.cpp @@ -113,20 +113,9 @@ double CGCMMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMT::loop(const bool _eflag, const bool _vflag) { +int CGCMMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMM; diff --git a/lib/gpu/lal_lj_sdk.cu b/lib/gpu/lal_lj_sdk.cu index 249b29a4b2..1bd9a93d5e 100644 --- a/lib/gpu/lal_lj_sdk.cu +++ b/lib/gpu/lal_lj_sdk.cu @@ -39,22 +39,25 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -111,9 +114,9 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, @@ -132,27 +135,30 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -209,8 +215,7 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk.h b/lib/gpu/lal_lj_sdk.h index fc50756a3f..043bafdda8 100644 --- a/lib/gpu/lal_lj_sdk.h +++ b/lib/gpu/lal_lj_sdk.h @@ -71,7 +71,7 @@ class CGCMM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_ext.cpp b/lib/gpu/lal_lj_sdk_ext.cpp index de0c5fef4f..4497233861 100644 --- a/lib/gpu/lal_lj_sdk_ext.cpp +++ b/lib/gpu/lal_lj_sdk_ext.cpp @@ -56,7 +56,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); @@ -74,7 +74,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, } if (gpu_rank==i && world_me!=0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_sdk_long.cpp b/lib/gpu/lal_lj_sdk_long.cpp index 74dbfc40e3..d78e8d84da 100644 --- a/lib/gpu/lal_lj_sdk_long.cpp +++ b/lib/gpu/lal_lj_sdk_long.cpp @@ -124,20 +124,9 @@ double CGCMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { +int CGCMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -145,8 +134,8 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMMLong; diff --git a/lib/gpu/lal_lj_sdk_long.cu b/lib/gpu/lal_lj_sdk_long.cu index 6dd1829c71..3972ed2076 100644 --- a/lib/gpu/lal_lj_sdk_long.cu +++ b/lib/gpu/lal_lj_sdk_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ 
__kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -138,7 +141,7 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -149,9 +152,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, @@ -173,6 +176,9 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -264,7 +270,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -275,8 +281,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk_long.h b/lib/gpu/lal_lj_sdk_long.h index 608488bd30..102b007b59 100644 --- a/lib/gpu/lal_lj_sdk_long.h +++ b/lib/gpu/lal_lj_sdk_long.h @@ -75,7 +75,7 @@ class CGCMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_long_ext.cpp b/lib/gpu/lal_lj_sdk_long_ext.cpp index f293487282..3170ac8b52 100644 --- a/lib/gpu/lal_lj_sdk_long_ext.cpp +++ b/lib/gpu/lal_lj_sdk_long_ext.cpp @@ -58,7 +58,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -77,7 +77,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, } if (gpu_rank==i && world_me!=0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git 
a/lib/gpu/lal_lj_tip4p_long.cpp b/lib/gpu/lal_lj_tip4p_long.cpp index 1f3b32248c..66477d1fb4 100644 --- a/lib/gpu/lal_lj_tip4p_long.cpp +++ b/lib/gpu/lal_lj_tip4p_long.cpp @@ -65,6 +65,12 @@ int LJTIP4PLongT::init(const int ntypes, k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib"); k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh"); k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite"); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.set_function(*this->pair_program_noev, + "k_lj_tip4p_long_distrib"); + #else + k_pair_dt_sel = &k_pair_distrib; + #endif TypeH = tH; TypeO = tO; @@ -151,6 +157,9 @@ void LJTIP4PLongT::clear() { k_pair_distrib.clear(); k_pair_reneigh.clear(); k_pair_newsite.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.clear(); + #endif this->clear_atomic(); } @@ -164,19 +173,9 @@ double LJTIP4PLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { +int LJTIP4PLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; int ainum=this->ans->inum(); const int nall = this->atom->nall(); @@ -210,8 +209,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { this->ansO.zero(); this->device->gpu->sync(); if(shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, @@ -228,12 +227,19 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { &this->atom->q, &cutsq, &_qqrd2e, &_g_ewald, &cut_coulsq, &cut_coulsqplus, &this->ansO); } + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_dt_sel = &k_pair_distrib; + else k_pair_dt_sel = &k_pair_distrib_noev; + #endif + GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - this->k_pair_distrib.set_size(GX,BX); - this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, - &hneight, &m, &TypeO, &TypeH, &alpha,&this->atom->q, &this->ansO); + k_pair_dt_sel->set_size(GX,BX); + k_pair_dt_sel->run(&this->atom->x, &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &hneight, &m, &TypeO, &TypeH, + &alpha,&this->atom->q, &this->ansO); this->time_pair.stop(); + return GX; } @@ -269,22 +275,26 @@ void LJTIP4PLongT::copy_relations_data(int n, tagint *tag, int *map_array, } } - - - // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template void LJTIP4PLongT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -315,7 +325,7 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -325,16 +335,23 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** LJTIP4PLongT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int *map_array, int map_size, int *sametag, int max_same, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int *map_array, int map_size, int *sametag, + int max_same, int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -373,7 +390,7 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 782ae43662..bd900d9244 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -129,7 +129,7 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)0.5 * alpha; f.y += fM.y * (acctyp)0.5 * alpha; f.z += fM.z * (acctyp)0.5 * alpha; - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum +iO]; engv[inum*engv_iter + i] += vM.x * (acctyp)0.5 * alpha; engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)0.5 * alpha; engv_iter++; @@ -147,13 +147,13 @@ 
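Both tip4p compute() overloads now widen the incoming bools to ints before kernel selection and keep the original bools for copy_answers(). Why the widened value is 2 rather than 1 is not visible in this hunk, so the sketch below simply mirrors the patch:

    static void widen_ev_flags_sketch(const bool eflag_in,
                                      const bool vflag_in,
                                      int &eflag, int &vflag) {
      eflag = eflag_in ? 2 : 0;   // values forwarded to set_kernel()/loop()
      vflag = vflag_in ? 2 : 0;
    }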
__kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)(1 - alpha); f.y += fM.y * (acctyp)(1 - alpha); f.z += fM.z * (acctyp)(1 - alpha); - if (eflag > 0) { + if (EVFLAG && eflag) { eM = engv[i+inum]; engv[inum+i] = eM*(acctyp)(1 - alpha); if (iH1 < inum) engv[inum+iH1] += eM * (acctyp)0.5 * alpha; if (iH2 < inum) engv[inum+iH2] += eM * (acctyp)0.5 * alpha; } - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum + i]; engv[inum*engv_iter + i] += vM.x * (acctyp)(1 - alpha); engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)(1 - alpha); engv_iter++; @@ -276,22 +276,27 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy = (acctyp)0; - acctyp e_coul = (acctyp)0; + int n_stride; + local_allocate_store_charge(); + acctyp4 f, fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; - acctyp virial[6],vO[6]; - for (int i=0; i<6; i++) { - virial[i]=(acctyp)0; - vO[i]=(acctyp)0; + acctyp energy, e_coul, virial[6], vO[6]; + if (EVFLAG) { + energy = (acctyp)0; + e_coul = (acctyp)0; + for (int i=0; i<6; i++) { + virial[i]=(acctyp)0; + vO[i]=(acctyp)0; + } } + int i; if (ii0) { + if (EVFLAG && eflag) { numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y); energy += factor_lj * (e - lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*forcelj; virial[1] += dely*dely*forcelj; virial[2] += delz*delz*forcelj; @@ -396,10 +401,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, fO.z += delz * force_coul; fO.w += 0; } - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { acctyp4 fd; fd.x = delx*force_coul; fd.y = dely*force_coul; @@ -489,10 +494,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, f.y += fd.y; f.z += fd.z; - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 xO; fetch4(xO,iO,pos_tex); @@ -508,62 +513,64 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } } // if cut_coulsqplus } // for nbor - if (t_per_atom>1) { -#if (ARCH < 300) - __local acctyp red_acc[6][BLOCK_PAIR]; - red_acc[0][tid]=fO.x; - red_acc[1][tid]=fO.y; - red_acc[2][tid]=fO.z; - red_acc[3][tid]=fO.w; + } // if ii + if (t_per_atom>1) { +#if (SHUFFLE_AVAIL == 0) + red_acc[0][tid]=fO.x; + red_acc[1][tid]=fO.y; + red_acc[2][tid]=fO.z; + red_acc[3][tid]=fO.w; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + fO.x=red_acc[0][tid]; + fO.y=red_acc[1][tid]; + fO.z=red_acc[2][tid]; + fO.w=red_acc[3][tid]; + if (EVFLAG && vflag) { + simdsync(); + for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); if (offset < s) { - for (int r=0; r<4; r++) + for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } - fO.x=red_acc[0][tid]; - fO.y=red_acc[1][tid]; - fO.z=red_acc[2][tid]; - fO.w=red_acc[3][tid]; - if (vflag>0) { - for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; - } 
+      for (int r=0; r<6; r++) vO[r]=red_acc[r][tid];
+    }
 #else
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      fO.x += shfl_down(fO.x, s, t_per_atom);
+      fO.y += shfl_down(fO.y, s, t_per_atom);
+      fO.z += shfl_down(fO.z, s, t_per_atom);
+      fO.w += shfl_down(fO.w, s, t_per_atom);
+    }
+    if (EVFLAG && vflag) {
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      fO.x += shfl_xor(fO.x, s, t_per_atom);
-      fO.y += shfl_xor(fO.y, s, t_per_atom);
-      fO.z += shfl_xor(fO.z, s, t_per_atom);
-      fO.w += shfl_xor(fO.w, s, t_per_atom);
-    }
-    if (vflag>0) {
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        for (int r=0; r<6; r++)
-          vO[r] += shfl_xor(vO[r], s, t_per_atom);
-      }
+        for (int r=0; r<6; r++)
+          vO[r] += shfl_down(vO[r], s, t_per_atom);
       }
+    }
 #endif
+  }
+  if (offset == 0 && ii<inum) {
+    ansO[i] = fO;
+    if (EVFLAG && vflag) {
+      ansO[inum + i].x = vO[0];
+      ansO[inum + i].y = vO[1];
+      ansO[inum + i].z = vO[2];
+      ansO[inum*2 + i].x = vO[3];
+      ansO[inum*2 + i].y = vO[4];
+      ansO[inum*2 + i].z = vO[5];
+    }
+  }
-    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
-                    vflag,ans,engv);
-  } // if ii
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
 }

 __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
@@ -592,28 +599,32 @@
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
+  int n_stride;
+  local_allocate_store_charge();
+
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     lj1[tid]=lj1_in[tid];
-    if (eflag>0)
+    if (EVFLAG && eflag)
       lj3[tid]=lj3_in[tid];
   }

-  acctyp energy = (acctyp)0;
-  acctyp e_coul = (acctyp)0;
   acctyp4 f, fO;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
-  acctyp virial[6],vO[6];
-  for (int i=0; i<6; i++) {
-    virial[i]=(acctyp)0;
-    vO[i]=(acctyp)0;
+  acctyp energy, e_coul, virial[6], vO[6];
+  if (EVFLAG) {
+    energy = (acctyp)0;
+    e_coul = (acctyp)0;
+    for (int i=0; i<6; i++) {
+      virial[i]=(acctyp)0;
+      vO[i]=(acctyp)0;
+    }
   }

   __syncthreads();
   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y);
         energy += factor_lj * (e - lj3[mtype].z);
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*forcelj;
         virial[1] += dely*dely*forcelj;
         virial[2] += delz*delz*forcelj;
@@ -720,10 +731,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
         fO.z += delz * force_coul;
         fO.w += 0;
       }
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         e_coul += prefactor*(_erfc-factor_coul);
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         acctyp4 fd;
         fd.x = delx*force_coul;
         fd.y = dely*force_coul;
@@ -813,10 +824,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       f.y += fd.y;
       f.z += fd.z;

-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
         numtyp4 xH2; fetch4(xH2,iH2,pos_tex);
         numtyp4 xO; fetch4(xO,iO,pos_tex);
@@ -833,13 +844,13 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       }
     } // if cut_coulsqplus
   } // for nbor
   if (t_per_atom>1) {
-#if (ARCH < 300)
-    __local acctyp red_acc[6][BLOCK_PAIR];
+#if (SHUFFLE_AVAIL == 0)
     red_acc[0][tid]=fO.x;
     red_acc[1][tid]=fO.y;
     red_acc[2][tid]=fO.z;
     red_acc[3][tid]=fO.w;
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      simdsync();
       if (offset < s) {
         for (int r=0; r<4; r++)
           red_acc[r][tid] += red_acc[r][tid+s];
@@ -849,9 +860,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     fO.y=red_acc[1][tid];
     fO.z=red_acc[2][tid];
     fO.w=red_acc[3][tid];
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       for (int r=0; r<6; r++) red_acc[r][tid]=vO[r];
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        simdsync();
         if (offset < s) {
           for (int r=0; r<6; r++)
             red_acc[r][tid] += red_acc[r][tid+s];
@@ -861,22 +873,22 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     }
 #else
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      fO.x += shfl_xor(fO.x, s, t_per_atom);
-      fO.y += shfl_xor(fO.y, s, t_per_atom);
-      fO.z += shfl_xor(fO.z, s, t_per_atom);
-      fO.w += shfl_xor(fO.w, s, t_per_atom);
+      fO.x += shfl_down(fO.x, s, t_per_atom);
+      fO.y += shfl_down(fO.y, s, t_per_atom);
+      fO.z += shfl_down(fO.z, s, t_per_atom);
+      fO.w += shfl_down(fO.w, s, t_per_atom);
     }
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
         for (int r=0; r<6; r++)
-          vO[r] += shfl_xor(vO[r], s, t_per_atom);
+          vO[r] += shfl_down(vO[r], s, t_per_atom);
       }
     }
 #endif
   }
   if(offset == 0) {
     ansO[i] = fO;
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       ansO[inum + i].x = vO[0];
       ansO[inum + i].y = vO[1];
       ansO[inum + i].z = vO[2];
@@ -885,7 +897,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       ansO[inum*2 + i].z = vO[5];
     }
   }
-  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
-                  vflag,ans,engv);
-  } // if ii
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
 }
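The EVFLAG guards added throughout these kernels follow one pattern: EVFLAG is a preprocessor constant (0 or 1) fixed when each kernel variant is JIT-compiled, while eflag and vflag stay run-time arguments. In the EVFLAG=0 variant every energy/virial branch and its accumulators become dead code the compiler removes. A minimal self-contained sketch of the idea in plain C++ (the macro value and loop body are illustrative assumptions, not the kernel code itself):

    #include <cstdio>

    #define EVFLAG 0   // a second compile of the same source uses 1

    // Toy pair loop: with EVFLAG == 0 the accumulation below folds to
    // 'if (0)' and is removed at compile time, so the no-EV variant
    // carries no energy bookkeeping at all.
    double pair_loop(int eflag) {
      double energy = 0.0;
      for (int j = 1; j <= 8; ++j) {
        double e_pair = 1.0 / j;      // stand-in for a pair energy term
        if (EVFLAG && eflag)
          energy += e_pair;
      }
      return energy;
    }

    int main() { std::printf("%g\n", pair_loop(1)); return 0; }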
diff --git a/lib/gpu/lal_lj_tip4p_long.h b/lib/gpu/lal_lj_tip4p_long.h
index 90c342e246..b163a62309 100644
--- a/lib/gpu/lal_lj_tip4p_long.h
+++ b/lib/gpu/lal_lj_tip4p_long.h
@@ -74,13 +74,13 @@ public:
   /// Reimplement BaseCharge pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, double *sublo,
-                double *subhi, tagint *tag, int *map_array, int map_size,
-                int *sametag, int max_same, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag,
-                const bool eatom, const bool vatom, int &host_start,
-                int **ilist, int **numj, const double cpu_time, bool &success,
-                double *charge, double *boxlo, double *prd);
+                double **host_x, int *host_type, double *sublo, double *subhi,
+                tagint *tag, int *map_array, int map_size, int *sametag,
+                int max_same, int **nspecial, tagint **special,
+                const bool eflag, const bool vflag, const bool eatom,
+                const bool vatom, int &host_start, int **ilist, int **numj,
+                const double cpu_time, bool &success, double *charge,
+                double *boxlo, double *prd);

   // --------------------------- TYPE DATA --------------------------
@@ -115,11 +115,12 @@ public:
   UCL_D_Vec<int> atom_sametag;

   UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite;
+  UCL_Kernel k_pair_distrib_noev, *k_pair_dt_sel;

 private:
   bool _allocated;
   int t_ago;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
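The header now declares a second distribution kernel (k_pair_distrib_noev) and a selection pointer (k_pair_dt_sel), and loop() returns an int instead of taking bools. A plausible reading, sketched with plain function pointers: the host picks the EV or no-EV variant once per call and launches through the pointer, with loop() reporting the grid size it used. All names and the grid-size value below are illustrative stand-ins, not the real launch code.

    #include <cstdio>

    // Stand-ins for the two JIT-compiled kernel variants.
    void k_distrib_ev()   { std::puts("distrib with energy/virial"); }
    void k_distrib_noev() { std::puts("distrib forces only"); }

    // Mirrors the k_pair_dt_sel idea: select once, launch via pointer.
    int loop(int eflag, int vflag) {
      void (*k_sel)() = (eflag || vflag) ? k_distrib_ev : k_distrib_noev;
      const int GX = 128;  // the real code derives this from inum
      k_sel();
      return GX;           // callers can reuse the launch size
    }

    int main() { return loop(0, 0) > 0 ? 0 : 1; }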
diff --git a/lib/gpu/lal_lj_tip4p_long_ext.cpp b/lib/gpu/lal_lj_tip4p_long_ext.cpp
index d0d6c7a3d2..7395506c2d 100644
--- a/lib/gpu/lal_lj_tip4p_long_ext.cpp
+++ b/lib/gpu/lal_lj_tip4p_long_ext.cpp
@@ -62,7 +62,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   if (world_me==0)
     init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                             host_lj4, offset, special_lj, inum,
-                            tH, tO, alpha, qdist, nall, 300,
+                            tH, tO, alpha, qdist, nall, max_nbors,
                             maxspecial, cell_size, gpu_split, screen,
                             host_cut_ljsq, host_cut_coulsq,
                             host_cut_coulsqplus, host_special_coul, qqrd2e,
                             g_ewald, map_size, max_same);
@@ -83,7 +83,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     if (gpu_rank==i && world_me!=0)
       init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                               host_lj4, offset, special_lj, inum,
-                              tH, tO, alpha, qdist, nall, 300, maxspecial,
+                              tH, tO, alpha, qdist, nall, max_nbors, maxspecial,
                               cell_size, gpu_split, screen, host_cut_ljsq,
                               host_cut_coulsq, host_cut_coulsqplus,
                               host_special_coul, qqrd2e,
@@ -97,7 +97,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fprintf(screen,"\n");

   if (init_ok==0)
-    LJTIP4PLMF.estimate_gpu_overhead();
+    LJTIP4PLMF.estimate_gpu_overhead(2);

   return init_ok;
 }
diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp
index 394d1f8a2f..e370b7bde5 100644
--- a/lib/gpu/lal_mie.cpp
+++ b/lib/gpu/lal_mie.cpp
@@ -113,20 +113,9 @@ double MieT::host_memory_usage() const {
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void MieT::loop(const bool _eflag, const bool _vflag) {
+int MieT::loop(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));

@@ -134,8 +123,8 @@ void MieT::loop(const bool _eflag, const bool _vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &mie1, &mie3, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &mie1, &mie3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->_threads_per_atom);
@@ -147,6 +136,7 @@
                      &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
+  return GX;
 }

 template class Mie<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu
index 36ec8a496b..fedfaf157a 100644
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@@ -39,22 +39,25 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);

   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
           mie3[mtype].z;
         energy+=factor_lj*e;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -105,9 +108,9 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }

 __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
@@ -126,6 +129,9 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
   __local numtyp4 mie1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 mie3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
           mie3[mtype].z;
         energy+=factor_lj*e;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -196,8 +202,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }
-
diff --git a/lib/gpu/lal_mie.h b/lib/gpu/lal_mie.h
index dfc2ee6e53..9a41596ccb 100644
--- a/lib/gpu/lal_mie.h
+++ b/lib/gpu/lal_mie.h
@@ -72,7 +72,7 @@ class Mie : public BaseAtomic<numtyp, acctyp> {
 private:
   bool _allocated;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp
index f612de4336..5cbb9c29d2 100644
--- a/lib/gpu/lal_mie_ext.cpp
+++ b/lib/gpu/lal_mie_ext.cpp
@@ -58,7 +58,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   if (world_me==0)
     init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3,
                       host_mie4, host_gamA, host_gamR,
-                      offset, special_lj, inum, nall, 300,
+                      offset, special_lj, inum, nall, max_nbors,
                       maxspecial, cell_size, gpu_split, screen);

   MLMF.device->world_barrier();
@@ -77,7 +77,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
     if (gpu_rank==i && world_me!=0)
       init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3,
                         host_mie4, host_gamA, host_gamR,
-                        offset, special_lj, inum, nall, 300, maxspecial,
+                        offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

     MLMF.device->gpu_barrier();
diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp
index 09da65d252..4bedc67ed7 100644
--- a/lib/gpu/lal_morse.cpp
+++ b/lib/gpu/lal_morse.cpp
@@ -112,20 +112,9 @@ double MorseT::host_memory_usage() const {
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void MorseT::loop(const bool _eflag, const bool _vflag) {
+int MorseT::loop(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));

@@ -133,8 +122,8 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &mor1, &mor2, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
                           &ainum, &nbor_pitch,
@@ -147,6 +136,7 @@
                      &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
+  return GX;
 }

 template class Morse<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index d6bab1e131..b1c8f2673b 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -41,22 +41,25 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);

   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y;
         energy+=e*factor_lj;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -106,9 +109,9 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }

 __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
@@ -127,27 +130,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     mor1[tid]=mor1_in[tid];
-    if (eflag>0)
+    if (EVFLAG && eflag)
       mor2[tid]=mor2_in[tid];
   }

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   __syncthreads();
   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
         energy+=e*factor_lj;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -197,8 +203,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }
-
diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h
index bf5f1c0f8f..c5948d8be8 100644
--- a/lib/gpu/lal_morse.h
+++ b/lib/gpu/lal_morse.h
@@ -71,7 +71,7 @@ class Morse : public BaseAtomic<numtyp, acctyp> {
 private:
   bool _allocated;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp
index 3b62d10305..f43676a1b5 100644
--- a/lib/gpu/lal_morse_ext.cpp
+++ b/lib/gpu/lal_morse_ext.cpp
@@ -56,7 +56,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   int init_ok=0;
   if (world_me==0)
     init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                       host_lj4, offset, special_lj, inum, nall, 300,
+                       host_lj4, offset, special_lj, inum, nall, max_nbors,
                        maxspecial, cell_size, gpu_split, screen);

   MORMF.device->world_barrier();
@@ -74,7 +74,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   }
   if (gpu_rank==i && world_me!=0)
     init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
-                       offset, special_lj, inum, nall, 300, maxspecial,
+                       offset, special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

     MORMF.device->gpu_barrier();
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 6c4890ef47..aabba49575 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -1,6 +1,7 @@
 /***************************************************************************
                                neighbor.cpp
                              -------------------
+                            Nitin Dhamankar (Intel)
                             W. Michael Brown (ORNL)
                                 Peng Wang (Nvidia)

@@ -32,22 +33,25 @@ int Neighbor::bytes_per_atom(const int max_nbors) const {
 }

 bool Neighbor::init(NeighborShared *shared, const int inum,
-                    const int host_inum, const int max_nbors,
-                    const int maxspecial, UCL_Device &devi,
-                    const int gpu_nbor, const int gpu_host,
-                    const bool pre_cut, const int block_cell_2d,
-                    const int block_cell_id, const int block_nbor_build,
-                    const int threads_per_atom, const int warp_size,
-                    const bool time_device,
-                    const std::string compile_flags) {
+                    const int host_inum, const int max_nbors,
+                    const int maxspecial, UCL_Device &devi, const int gpu_nbor,
+                    const int gpu_host, const bool pre_cut,
+                    const int block_cell_2d, const int block_cell_id,
+                    const int block_nbor_build, const int threads_per_atom,
+                    const int simd_size, const bool time_device,
+                    const std::string compile_flags, const bool ilist_map) {
   clear();

+  _ilist_map = ilist_map;
   _threads_per_atom=threads_per_atom;
   _block_cell_2d=block_cell_2d;
   _block_cell_id=block_cell_id;
-  _max_block_nbor_build=block_nbor_build;
   _block_nbor_build=block_nbor_build;
-  _warp_size=warp_size;
+  _simd_size=simd_size;
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  if (_block_nbor_build < _simd_size)
+    _block_nbor_build = _simd_size;
+  #endif
   _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
@@ -90,7 +94,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,

   _max_atoms=1000;
   _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
-  _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
+
+  _max_neighbor_factor=1.0e-2*max_nbors*1.1;
+  if (_gpu_nbor != 1)
+    _max_nbors=0;
+  else
+    _max_nbors=300;
+  if (_old_max_nbors) _max_nbors=_old_max_nbors;

   _maxspecial=maxspecial;
   if (gpu_nbor==0)
@@ -103,8 +113,36 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   if (!success)
     return false;

-  if (_use_packing==false)
-    _shared->compile_kernels(devi,gpu_nbor,compile_flags);
+  if (_use_packing==false) {
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _shared->compile_kernels(devi, gpu_nbor, compile_flags+
+      " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size));
+    #else
+    _shared->compile_kernels(devi,gpu_nbor,compile_flags);
+    #endif
+
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    if (_gpu_nbor) {
+      #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || \
+                                  defined(CL_VERSION_3_0))
+      if (dev->has_subgroup_support()) {
+        int simd_size_kernel=
+          _shared->k_build_nbor.max_subgroup_size(_block_nbor_build);
+        if (_simd_size != simd_size_kernel) {
+          _simd_size = simd_size_kernel;
+          if (_block_nbor_build < _simd_size)
+            _block_nbor_build = _simd_size;
+          _shared->clear();
+          _shared->compile_kernels(devi, gpu_nbor, compile_flags+
+            " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size));
+        }
+      }
+      #endif
+      _bin_stencil.get_global(*(_shared->build_program),"bin_stencil");
+    }
+    #endif
+  }
+  _max_block_nbor_build=_block_nbor_build;

   return success;
 }
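With this init() change, max_nbors no longer sets the initial neighbor-row count directly: _max_neighbor_factor = 1.0e-2*max_nbors*1.1 treats the argument as a percentage (with 10% padding) that later scales a density-derived estimate in build_nbor_list(). A rough stand-alone illustration of that reading; 4.18879 is 4*pi/3 for a cutoff sphere, and all the inputs here are invented for the example:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Per-atom neighbor estimate from uniform density, scaled by the
    // percentage-style max_nbors factor described above.
    int estimate_rows(double atoms_per_vol, double cutoff, int max_nbors) {
      const double factor = 1.0e-2 * max_nbors * 1.1;
      const double in_sphere = 4.18879 * cutoff * cutoff * cutoff *
                               atoms_per_vol;   // 4/3*pi*r^3 * density
      return std::max(1, (int)std::ceil(factor * in_sphere));
    }

    int main() { std::printf("%d\n", estimate_rows(0.05, 10.0, 300)); }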
@@ -113,24 +151,44 @@ void Neighbor::alloc(bool &success) {
   dev_nbor.clear();
   host_acc.clear();
   int nt=_max_atoms+_max_host;
-  if (_use_packing==false || _gpu_nbor>0)
-    success=success &&
-      (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
-  else
+  if (_max_nbors)
+    _max_nbors = ((_max_nbors-1)/_threads_per_atom+1)*_threads_per_atom;
+  if (_use_packing==false || _gpu_nbor>0) {
+    if (_max_nbors)
+      success=success &&
+        (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
+  } else
     success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
-  success=success && (host_acc.alloc(nt*2,*dev,
-                                     UCL_READ_WRITE)==UCL_SUCCESS);
+  if (_gpu_nbor != 2 || _max_host>0)
+    success=success && (host_acc.alloc(nt*2,*dev,
+                                       UCL_READ_WRITE)==UCL_SUCCESS);

   _c_bytes=dev_nbor.row_bytes();
   if (_alloc_packed) {
+    if (_use_packing==false) {
+      dev_packed_begin.clear();
+      success=success && (dev_packed_begin.alloc(_max_atoms,*dev,
+                          _packed_permissions)==UCL_SUCCESS);
+    }
+
     dev_packed.clear();
-    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
-                                         _packed_permissions)==UCL_SUCCESS);
-    dev_ilist.clear();
-    success=success && (dev_ilist.alloc(_max_atoms,*dev,
-                                        UCL_READ_WRITE)==UCL_SUCCESS);
-    _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes();
+    if (_max_nbors)
+      success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
+                                           _packed_permissions)==UCL_SUCCESS);
+    if (_ilist_map) {
+      if (_gpu_nbor) {
+        if (three_ilist.numel()==0)
+          success=success && (three_ilist.alloc(16,*dev,UCL_READ_WRITE,
+                                                UCL_READ_ONLY)==UCL_SUCCESS);
+      } else {
+        three_ilist.clear();
+        success=success && (three_ilist.alloc(_max_atoms,*dev,UCL_READ_WRITE,
+                                              UCL_READ_ONLY)==UCL_SUCCESS);
+      }
+      _c_bytes+=three_ilist.row_bytes();
+    }
+    _c_bytes+=dev_packed.row_bytes()+dev_packed_begin.row_bytes();
   }
   if (_max_host>0) {
     nbor_host.clear();
@@ -138,8 +196,9 @@ void Neighbor::alloc(bool &success) {
     host_ilist.clear();
     host_jlist.clear();

-    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
-                             UCL_READ_WRITE)==UCL_SUCCESS) && success;
+    if (_max_nbors)
+      success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
+                               UCL_READ_WRITE)==UCL_SUCCESS) && success;
     success=success && (dev_numj_host.alloc(_max_host,*dev,
                                             UCL_READ_WRITE)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
@@ -157,7 +216,8 @@ void Neighbor::alloc(bool &success) {
       ptr+=_max_nbors;
     }
     _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
-  } else {
+  } else if (dev_nbor.numel()) {
+    if (!success) return;
     // Some OpenCL implementations return errors for nullptr pointers as args
     nbor_host.device.view(dev_nbor);
     dev_numj_host.view(dev_nbor);
@@ -188,6 +248,12 @@ void Neighbor::clear() {
   if (_ncells>0) {
     _ncells=0;
     cell_counts.clear();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    cell_subgroup_counts.clear();
+    subgroup2cell.clear();
+    _host_bin_stencil.clear();
+    _bin_stencil.clear();
+#endif
     if (_gpu_nbor==2)
       delete [] cell_iter;
   }
@@ -195,12 +261,15 @@ void Neighbor::clear() {

   _allocated=false;
   _nbor_time_avail=false;
+  _old_max_nbors=_max_nbors;
+  _max_nbors=0;

   host_packed.clear();
   host_acc.clear();
-  dev_ilist.clear();
+  three_ilist.clear();
   dev_nbor.clear();
   nbor_host.clear();
   dev_packed.clear();
+  dev_packed_begin.clear();
   dev_numj_host.clear();
   host_ilist.clear();
   host_jlist.clear();
@@ -236,9 +305,9 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   UCL_H_Vec<int> ilist_view;
   ilist_view.view(ilist,inum,*dev);
   ucl_copy(dev_nbor,ilist_view,false);
-
-  UCL_D_Vec<int> nbor_offset;
-  UCL_H_Vec<int> host_offset;
+  #ifndef GERYON_OCL_FLUSH
+  dev_nbor.flush();
+  #endif

   int copy_count=0;
   int ij_count=0;
@@ -263,9 +332,12 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,

     if (ij_count==IJ_SIZE) {
       dev_nbor.sync();
-      host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
-      nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
-      ucl_copy(nbor_offset,host_offset,true);
+      _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+      _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+      ucl_copy(_nbor_offset,_host_offset,true);
+      #ifndef GERYON_OCL_FLUSH
+      _nbor_offset.flush();
+      #endif
       copy_count++;
       ij_count=0;
       dev_count+=IJ_SIZE;
@@ -275,21 +347,29 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   }
   if (ij_count!=0) {
     dev_nbor.sync();
-    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
-    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
-    ucl_copy(nbor_offset,host_offset,true);
+    _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    _nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(_nbor_offset,_host_offset,true);
+  }
+  _acc_view.view_offset(inum,dev_nbor,inum*2);
+  if (_use_packing)
+    ucl_copy(_acc_view,host_acc,inum*2,true);
+  else {
+    ucl_copy(_acc_view,host_acc,inum,true);
+    _host_offset.view_offset(inum,host_acc,inum);
+    ucl_copy(dev_packed_begin,_host_offset,inum,true);
   }
-  UCL_D_Vec<int> acc_view;
-  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,inum*2,true);
-  UCL_H_Vec<int> host_view;
-  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
-  for (int ii=0; ii<inum; ii++) {
   time_nbor.stop();

   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                  block_size));
     _shared->k_nbor.set_size(GX,block_size);
-    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum,
+                        &_threads_per_atom);
     time_kernel.stop();
   }
 }
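get_host() stages the packed neighbor list to the device in IJ_SIZE chunks that alternate between the two halves of the pinned host_packed buffer (the IJ_SIZE*(copy_count%2) offset), so the asynchronous copy of one chunk can overlap packing of the next; dev_packed_begin now records separately where each atom's entries start. A toy version of the ping-pong staging, with a tiny chunk size for illustration:

    #include <cstdio>
    #include <vector>

    int main() {
      const int IJ_SIZE = 4;                // 131072 in lal_neighbor.h
      std::vector<int> host_packed(2 * IJ_SIZE);
      std::vector<int> device;              // stand-in for dev_packed
      int copy_count = 0, ij_count = 0;
      for (int ij = 0; ij < 10; ++ij) {     // 10 entries to stage
        host_packed[IJ_SIZE * (copy_count % 2) + ij_count++] = ij;
        if (ij_count == IJ_SIZE) {          // chunk full: "async" copy
          const int off = IJ_SIZE * (copy_count % 2);
          device.insert(device.end(), host_packed.begin() + off,
                        host_packed.begin() + off + IJ_SIZE);
          ++copy_count;                     // switch to the other half
          ij_count = 0;
        }
      }
      std::printf("%zu copied, %d pending\n", device.size(), ij_count);
    }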
@@ -315,9 +396,6 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
   ilist_view.view(ilist,inum,*dev);
   ucl_copy(dev_nbor,ilist_view,false);

-  UCL_D_Vec<int> nbor_offset;
-  UCL_H_Vec<int> host_offset;
-
   int copy_count=0;
   int ij_count=0;
   int acc_count=0;
@@ -346,9 +424,9 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,

     if (ij_count==IJ_SIZE) {
       dev_nbor.sync();
-      host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
-      nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
-      ucl_copy(nbor_offset,host_offset,true);
+      _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+      _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+      ucl_copy(_nbor_offset,_host_offset,true);
       copy_count++;
       ij_count=0;
       dev_count+=IJ_SIZE;
@@ -358,13 +436,18 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
   }
   if (ij_count!=0) {
     dev_nbor.sync();
-    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
-    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
-    ucl_copy(nbor_offset,host_offset,true);
+    _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    _nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(_nbor_offset,_host_offset,true);
+  }
+  _acc_view.view_offset(inum,dev_nbor,inum*2);
+  if (_use_packing)
+    ucl_copy(_acc_view,host_acc,inum*2,true);
+  else {
+    ucl_copy(_acc_view,host_acc,inum,true);
+    _host_offset.view_offset(inum,host_acc,inum);
+    ucl_copy(dev_packed_begin,_host_offset,inum,true);
   }
-  UCL_D_Vec<int> acc_view;
-  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,inum*2,true);
   time_nbor.stop();

   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                  block_size));
     _shared->k_nbor.set_size(GX,block_size);
-    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum,
+                        &_threads_per_atom);
     time_kernel.stop();
   }
 }

 template <class numtyp>
-void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
+void Neighbor::resize_max_neighbors(int maxn, bool &success) {
+  if (maxn == 0) maxn = 1;
   if (maxn>_max_nbors) {
     int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
-    mn=(mn/_threads_per_atom+1)*_threads_per_atom;
-    success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
+    mn = ((mn-1)/_threads_per_atom+1)*_threads_per_atom;
+    dev_nbor.clear();
+    success=success &&
+      (dev_nbor.alloc((mn+2)*_max_atoms,*dev)==UCL_SUCCESS);
+    if (!success) return;
     _gpu_bytes=dev_nbor.row_bytes();
     if (_max_host>0) {
-      success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
+      nbor_host.clear();
+      success=(nbor_host.alloc(mn*_max_host,*dev,UCL_READ_WRITE,
+                               UCL_READ_WRITE)==UCL_SUCCESS) && success;
+      if (!success) return;
       int *ptr=nbor_host.host.begin();
       for (int i=0; i<_max_host; i++) {
         host_jlist[i]=ptr;
@@ -397,7 +488,9 @@ void Neighbor::resize_max_neighbors(int maxn, bool &success) {
       dev_numj_host.view(dev_nbor);
     }
     if (_alloc_packed) {
-      success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
+      dev_packed.clear();
+      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
+                                           _packed_permissions)==UCL_SUCCESS);
      _gpu_bytes+=dev_packed.row_bytes();
     }
     _max_nbors=mn;
@@ -409,32 +502,66 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                                const int nall, Atom<numtyp,acctyp> &atom,
                                double *sublo, double *subhi, tagint *tag,
                                int **nspecial, tagint **special, bool &success,
-                               int &mn) {
+                               int &mn, UCL_Vector<int,int> &error_flag) {
   _nbor_time_avail=true;
   const int nt=inum+host_inum;

+  const double subx = subhi[0]-sublo[0];
+  const double suby = subhi[1]-sublo[1];
+  const double subz = subhi[2]-sublo[2];
+
   // Calculate number of cells and allocate storage for binning as necessary
-  int ncellx, ncelly, ncellz, ncell_3d;
-  int ghost_cells=2*_cells_in_cutoff;
-  ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
-  ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
-  ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
-  ncell_3d = ncellx * ncelly * ncellz;
+  int ncellx, ncelly, ncellz;
+  int cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+  int ghost_cells=2*cells_in_cutoff;
+  ncellx = static_cast<int>(ceil(subx/_cell_size))+ghost_cells;
+  ncelly = static_cast<int>(ceil(suby/_cell_size))+ghost_cells;
+  ncellz = static_cast<int>(ceil(subz/_cell_size))+ghost_cells;
+
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  if (_auto_cell_size && subz>0.0) {
+    if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) {
+      _cell_size = _shared->best_cell_size(subx, suby, subz, nt, _cutoff);
+      cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+      ghost_cells=2*cells_in_cutoff;
+      ncellx = static_cast<int>(ceil(subx/_cell_size))+ghost_cells;
+      ncelly = static_cast<int>(ceil(suby/_cell_size))+ghost_cells;
+      ncellz = static_cast<int>(ceil(subz/_cell_size))+ghost_cells;
+    }
+  }
+  #endif
+
+  int ncell_3d = ncellx * ncelly * ncellz;
   if (ncell_3d+1>_ncells) {
     cell_counts.clear();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    cell_subgroup_counts.clear();
+#endif
     if (_gpu_nbor==2) {
       if (_ncells>0)
         delete [] cell_iter;
       cell_iter = new int[ncell_3d+1];
-      cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
+      success = success && (cell_counts.alloc(ncell_3d+1,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+#ifndef LAL_USE_OLD_NEIGHBOR
+      success = success && (cell_subgroup_counts.alloc(ncell_3d+1,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+      if (!success) return;
+      cell_subgroup_counts.host[0]=0;
+#endif
     } else {
       cell_counts.device.clear();
-      cell_counts.device.alloc(ncell_3d+1,dev_nbor);
+      success = success && (cell_counts.device.alloc(ncell_3d+1,
+                            *dev) == UCL_SUCCESS);
     }
+    if (!success) return;
     _ncells=ncell_3d+1;
     _cell_bytes=cell_counts.device.row_bytes();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    _cell_bytes+=cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes();
+#endif
   }

   const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@@ -463,7 +590,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   }

   // If binning on CPU, do this now
+#ifndef LAL_USE_OLD_NEIGHBOR
+  int subgroup_count = 0;
+#endif
   if (_gpu_nbor==2) {
+    #ifndef GERYON_OCL_FLUSH
+    dev_nbor.flush();
+    #endif
     double stime = MPI_Wtime();
     int *cell_id=atom.host_cell_id.begin();
     int *particle_id=atom.host_particle_id.begin();
@@ -472,21 +605,21 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     cell_counts.host.zero();
     double i_cell_size=1.0/_cell_size;

-    int offset_hi=_cells_in_cutoff+1;
+    int offset_hi=cells_in_cutoff+1;
     for (int i=0; i<nt; i++) {
-      int ix = static_cast<int>(px*i_cell_size+1);
-      ix = std::max(ix,_cells_in_cutoff);
+      int ix = static_cast<int>(px*i_cell_size+cells_in_cutoff);
+      ix = std::max(ix,cells_in_cutoff);
       ix = std::min(ix,ncellx-offset_hi);
-      int iy = static_cast<int>(py*i_cell_size+1);
-      iy = std::max(iy,_cells_in_cutoff);
+      int iy = static_cast<int>(py*i_cell_size+cells_in_cutoff);
+      iy = std::max(iy,cells_in_cutoff);
       iy = std::min(iy,ncelly-offset_hi);
-      int iz = static_cast<int>(pz*i_cell_size+1);
-      iz = std::max(iz,_cells_in_cutoff);
+      int iz = static_cast<int>(pz*i_cell_size+cells_in_cutoff);
+      iz = std::max(iz,cells_in_cutoff);
       iz = std::min(iz,ncellz-offset_hi);

       int id = ix+iy*ncellx+iz*ncellx*ncelly;
@@ -494,19 +627,40 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       cell_counts[id+1]++;
     }

+#ifndef LAL_USE_OLD_NEIGHBOR
+    // populate subgroup counts only for the local atoms
+    for (int i=1; i<_ncells; i++) {
+      cell_subgroup_counts[i] = ceil(static_cast<double>(cell_counts[i]) /
+                                     _simd_size);
+      subgroup_count += cell_subgroup_counts[i];
+      cell_subgroup_counts[i] += cell_subgroup_counts[i-1];
+    }
+    if (subgroup_count > subgroup2cell.numel()) {
+      subgroup2cell.clear();
+      success = success && (subgroup2cell.alloc(1.1*subgroup_count,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+      if (!success) return;
+      _cell_bytes=cell_counts.device.row_bytes() +
+        cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes();
+    }
+    for (int i=1; i<_ncells; i++)
+      for (int j=cell_subgroup_counts[i-1]; j<cell_subgroup_counts[i]; j++)
+        subgroup2cell[j]=i-1;
+#endif
-      int ix = static_cast<int>(px*i_cell_size+1);
+      int ix = static_cast<int>(px*i_cell_size);
       ix = std::max(ix,0);
       ix = std::min(ix,ncellx-1);
-      int iy = static_cast<int>(py*i_cell_size+1);
+      int iy = static_cast<int>(py*i_cell_size);
       iy = std::max(iy,0);
       iy = std::min(iy,ncelly-1);
-      int iz = static_cast<int>(pz*i_cell_size+1);
+      int iz = static_cast<int>(pz*i_cell_size);
       iz = std::max(iz,0);
       iz = std::min(iz,ncellz-1);

@@ -518,21 +672,54 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     mn=0;
     for (int i=0; i<_ncells; i++)
       mn=std::max(mn,cell_counts[i]);
-    mn*=8;
-    set_nbor_block_size(mn/2);
-
+    double mind=std::min(subx,suby);
+    mind=std::min(mind,subz) + _cutoff;
+    double ics;
+    if (mind >= _cell_size) ics = i_cell_size;
+    else ics = 1.0 / mind;
+    double vadjust=_cutoff*ics;
+    vadjust*=vadjust*vadjust*4.1888;
+    if (_cutoff < _cell_size) vadjust*=1.46;
+    mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
+    if (mn<33) mn+=3;
     resize_max_neighbors(mn,success);
+    set_nbor_block_size(mn/2);
     if (!success)
       return;
     _total_atoms=nt;

+    // For neighbor builds for host atoms, _max_nbors is used for neighbor
+    // allocation offsets.
+    if (_max_host > 0) mn=_max_nbors;
+
     cell_iter[0]=0;
     for (int i=1; i<_ncells; i++) {
       cell_counts[i]+=cell_counts[i-1];
       cell_iter[i]=cell_counts[i];
     }
     time_hybrid1.start();
-    cell_counts.update_device(true);
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) {
+      _old_ncellx = ncellx;
+      _old_ncelly = ncelly;
+      _old_ncellz = ncellz;
+      const int bin_stencil_stride = cells_in_cutoff * 2 + 1;
+      const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride;
+      if (bin_stencil_size > _host_bin_stencil.numel())
+        _host_bin_stencil.alloc(bin_stencil_size,*dev);
+      for (int s = 0; s<bin_stencil_size; s++)
     _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
                            &atom.dev_particle_id, &sublo0, &sublo1,
                            &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
-                           &nt, &nall, &_cells_in_cutoff);
+                           &nt, &nall, &cells_in_cutoff);

     atom.sort_neighbor(nall);
@@ -575,22 +762,37 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,

   /* build the neighbor list */
   const int cell_block=_block_nbor_build;
+#ifndef LAL_USE_OLD_NEIGHBOR
+  int nblocks = (subgroup_count-1)/(cell_block/_simd_size)+1;
+  _shared->k_build_nbor.set_size(nblocks, cell_block);
+  _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
+                            &cell_counts, &dev_nbor, &nbor_host,
+                            &dev_numj_host, &mn, &cutoff_cast, &ncellx,
+                            &ncelly, &ncellz, &inum, &nt, &nall,
+                            &_threads_per_atom, &cells_in_cutoff,
+                            &cell_subgroup_counts, &subgroup2cell,
+                            &subgroup_count, _bin_stencil.begin(),
+                            &error_flag);
+  error_flag.update_host();
+#else
   _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
                                  (ncellz-ghost_cells),cell_block,1);
   _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
                             &cell_counts, &dev_nbor, &nbor_host,
-                            &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
+                            &dev_numj_host, &mn, &cutoff_cast, &ncellx,
                             &ncelly, &ncellz, &inum, &nt, &nall,
-                            &_threads_per_atom, &_cells_in_cutoff);
+                            &_threads_per_atom, &cells_in_cutoff);
+#endif

   /* Get the maximum number of nbors and realloc if necessary */
-  UCL_D_Vec<int> numj;
-  numj.view_offset(inum,dev_nbor,inum);
-  ucl_copy(host_acc,numj,inum,true);
-  if (nt>inum) {
-    UCL_H_Vec<int> host_offset;
-    host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_numj_host,nt-inum,true);
+  UCL_D_Vec<int> _numj_view;
+  if (_gpu_nbor!=2 || inum<nt) {
+    if (nt>inum) {
+      _host_offset.view_offset(inum,host_acc,nt-inum);
+      ucl_copy(_host_offset,dev_numj_host,nt-inum,true);
+    }
   }

   if (_gpu_nbor!=2) {
@@ -608,7 +810,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       if (_time_device)
         time_kernel.add_to_total();
       build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
-                      nspecial, special, success, mn);
+                      nspecial, special, success, mn, error_flag);
       return;
     }
   }
@@ -634,5 +836,5 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 template void Neighbor::build_nbor_list<PRECISION>
     (double **x, const int inum, const int host_inum, const int nall,
      Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
-     tagint *, int **, tagint **, bool &success, int &mn);
-
+     tagint *, int **, tagint **, bool &success, int &mn,
+     UCL_Vector<int,int> &error_flag);
diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h
index 996deaff6d..5939567a41 100644
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@@ -1,6 +1,7 @@
 /***************************************************************************
                                 neighbor.h
                              -------------------
+                            Nitin Dhamankar (Intel)
                             W. Michael Brown (ORNL)
                                 Peng Wang (Nvidia)

@@ -19,14 +20,25 @@

 #include "lal_atom.h"
 #include "lal_neighbor_shared.h"
+#include <sstream>

 #define IJ_SIZE 131072

+#if !defined(USE_OPENCL) && !defined(USE_HIP)
+#ifndef LAL_USE_OLD_NEIGHBOR
+// Issue with incorrect results with CUDA 11.2
+#if (CUDA_VERSION > 11019) && (CUDA_VERSION < 11030)
+#define LAL_USE_OLD_NEIGHBOR
+#endif
+#endif
+#endif
+
 namespace LAMMPS_AL {

 class Neighbor {
  public:
-  Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {}
+  Neighbor() : _allocated(false), _use_packing(false), _ncells(0),
+               _old_max_nbors(0) {}
   ~Neighbor() { clear(); }

   /// Determine whether neighbor unpacking should be used
@@ -37,7 +49,7 @@ class Neighbor {
   /// Clear any old data and setup for new LAMMPS run
   /** \param inum Initial number of particles whose neighbors stored on device
    * \param host_inum Initial number of particles whose nbors copied to host
-   * \param max_nbors Initial number of rows in the neighbor matrix
+   * \param max_nbors Factor (in percentage) applied to density calculated max
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device
@@ -48,33 +60,41 @@ class Neighbor {
    *        than the force kernel
    * \param threads_per_atom Number of threads used per atom for force
    *        calculation
-   * \param compile_flags Flags for JIT compiling **/
+   * \param compile_flags Flags for JIT compiling
+   * \param ilist_map true if ilist mapping data structures used (3-body) **/
   bool init(NeighborShared *shared, const int inum, const int host_inum,
             const int max_nbors, const int maxspecial, UCL_Device &dev,
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const int warp_size, const bool time_device,
-            const std::string compile_flags);
+            const int simd_size, const bool time_device,
+            const std::string compile_flags, const bool ilist_map);

-  /// Set the size of the cutoff+skin
-  inline void cell_size(const double size, const double cutoff) {
-    _cell_size=size;
+  /// Set the cutoff+skin
+  inline void set_cutoff(const double cutoff) {
     _cutoff=cutoff;
-    if (cutoff>size)
-      _cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
-    else
-      _cells_in_cutoff=1;
+
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _cell_size=_shared->cell_size();
+    _auto_cell_size=_shared->auto_cell_size();
+    const int cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+    if (cells_in_cutoff > 2) _cell_size=_cutoff*0.5;
+    _old_ncellx = _old_ncelly = _old_ncellz = -1;
+    #else
+    _cell_size=cutoff;
+    _auto_cell_size=false;
+    #endif
   }

-  /// Get the size of the cutoff+skin
-  inline double cell_size() const { return _cell_size; }
+  /// Get the cutoff+skin
+  inline double cutoff() { return _cutoff; }

   /// Check if there is enough memory for neighbor data and realloc if not
   /** \param inum Number of particles whose nbors will be stored on device
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
-  inline void resize(const int inum, const int max_nbor, bool &success) {
+  inline void resize(const int inum, int max_nbor, bool &success) {
+    if (max_nbor == 0) max_nbor = 1;
     if (inum>_max_atoms || max_nbor>_max_nbors) {
       _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
       if (max_nbor>_max_nbors)
@@ -88,8 +108,9 @@ class Neighbor {
    * \param host_inum Number of particles whose nbors will be copied to host
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
-  inline void resize(const int inum, const int host_inum, const int max_nbor,
+  inline void resize(const int inum, const int host_inum, int max_nbor,
                      bool &success) {
+    if (max_nbor == 0) max_nbor = 1;
     if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
       _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
       _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
@@ -99,15 +120,8 @@ class Neighbor {
     }
   }

-  inline void acc_timers() {
+  inline void acc_timers(FILE *screen) {
     if (_nbor_time_avail) {
-      if (_gpu_nbor==2) {
-        int mn=0;
-        for (int i=0; i<_total_atoms; i++)
-          mn=std::max(mn,host_acc[i]);
-        if (mn>_max_nbors)
-          assert(0==1);
-      }
       if (_time_device) {
         time_nbor.add_to_total();
         if (_use_packing==false)
           time_kernel.add_to_total();
@@ -172,9 +186,10 @@ class Neighbor {
   /// Build nbor list on the device
   template <class numtyp>
   void build_nbor_list(double **x, const int inum, const int host_inum,
-                       const int nall, Atom<numtyp,acctyp> &atom, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial,
-                       tagint **special, bool &success, int &max_nbors);
+                       const int nall, Atom<numtyp,acctyp> &atom,
+                       double *sublo, double *subhi, tagint *tag,
+                       int **nspecial, tagint **special, bool &success,
+                       int &max_nbors, UCL_Vector<int,int> &error_flag);

   /// Return the number of bytes used on device
   inline double gpu_bytes() {
@@ -193,14 +208,16 @@ class Neighbor {
    * - 3rd row is starting location in packed nbors
    * - Remaining rows are the neighbors arranged for coalesced access **/
   UCL_D_Vec<int> dev_nbor;
+  /// Starting location in packed neighbors used only by unpack kernel
+  UCL_D_Vec<int> dev_packed_begin;
   /// Packed storage for neighbor lists copied from host
   UCL_D_Vec<int> dev_packed;
   /// Host buffer for copying neighbor lists
   UCL_H_Vec<int> host_packed;
   /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
   UCL_H_Vec<int> host_acc;
-  /// Device storage for accessing atom indices from the neighbor list (3-body)
-  UCL_D_Vec<int> dev_ilist;
+  /// Storage for accessing atom indices from the neighbor list (3-body)
+  UCL_Vector<int,int> three_ilist;

   // ----------------- Data for GPU Neighbor Calculation ---------------

@@ -217,18 +234,36 @@ class Neighbor {
   UCL_D_Vec<tagint> dev_special, dev_special_t;
   /// Host/Device storage for number of particles per cell
   UCL_Vector<int,int> cell_counts;
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  /// Host/Device storage for number of subgroups per cell
+  UCL_Vector<int,int> cell_subgroup_counts;
+  /// Host/Device storage for subgroup to cell mapping
+  UCL_Vector<int,int> subgroup2cell;
+  #endif
   int *cell_iter;

   /// Device timers
   UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

+  /// Effective SIMD width of neighbor build kernel
+  inline int simd_size() { return _simd_size; }
+
+  template <class t>
+  inline std::string toa(const t& in) {
+    std::ostringstream o;
+    o.precision(2);
+    o << in;
+    return o.str();
+  }
+
  private:
   NeighborShared *_shared;
   UCL_Device *dev;
   bool _allocated, _use_packing, _nbor_time_avail, _time_device;
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
-  bool _gpu_host, _alloc_packed;
-  double _cutoff, _cell_size, _bin_time;
+  int _old_max_nbors;
+  bool _gpu_host, _alloc_packed, _ilist_map, _auto_cell_size;
+  double _cutoff, _bin_time, _max_neighbor_factor, _cell_size;
   enum UCL_MEMOPT _packed_permissions;

   double _gpu_bytes, _c_bytes, _cell_bytes;
@@ -236,18 +271,29 @@ class Neighbor {
   int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
   int _ncells, _threads_per_atom, _total_atoms;
-  int _cells_in_cutoff;

   template <class numtyp>
-  inline void resize_max_neighbors(const int maxn, bool &success);
+  inline void resize_max_neighbors(int maxn, bool &success);

-  int _warp_size;
+  // For viewing host arrays for data copy operations
+  UCL_H_Vec<int> _host_offset;
+  UCL_D_Vec<int> _nbor_offset, _acc_view, _numj_view;
+
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  UCL_H_Vec<int> _host_bin_stencil;
+  UCL_Const _bin_stencil;
+  int _old_ncellx, _old_ncelly, _old_ncellz;
+  #endif
+
+  int _simd_size;

   inline void set_nbor_block_size(const int mn) {
+    #ifdef LAL_USE_OLD_NEIGHBOR
-    int desired=mn/(2*_warp_size);
-    desired*=_warp_size;
-    if (desired<_warp_size) desired=_warp_size;
+    int desired=mn/(2*_simd_size);
+    desired*=_simd_size;
+    if (desired<_simd_size) desired=_simd_size;
     else if (desired>_max_block_nbor_build)
       desired=_max_block_nbor_build;
     _block_nbor_build=desired;
+    #endif
   }
 };

 }
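The LAL_USE_OLD_NEIGHBOR fallback above keys off the toolkit version because CUDA_VERSION encodes major*1000 + minor*10, so the open interval (11019, 11030) matches exactly the CUDA 11.2 releases noted as producing incorrect results with the new neighbor build. A quick self-contained check of that arithmetic:

    #include <cstdio>

    int main() {
      // CUDA_VERSION = major*1000 + minor*10, e.g. 11.2 -> 11020
      for (int v : {11010, 11020, 11030}) {
        const bool old_path = (v > 11019) && (v < 11030);
        std::printf("CUDA_VERSION %d -> old neighbor path: %s\n",
                    v, old_path ? "yes" : "no");
      }
    }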
diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu
index f8b32e1746..3dfe23bdc2 100644
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@@ -19,6 +19,7 @@

 __kernel void kernel_unpack(__global int *dev_nbor,
                             const __global int *dev_ij,
+                            const __global int *dev_ij_begin,
                             const int inum, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int offset=tid & (t_per_atom-1);
@@ -28,7 +29,7 @@ __kernel void kernel_unpack(__global int *dev_nbor,
     int nbor=ii+inum;
     int numj=dev_nbor[nbor];
     nbor+=inum;
-    int list=dev_nbor[nbor];
+    int list=dev_ij_begin[ii];
     int list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
@@ -40,4 +41,3 @@ __kernel void kernel_unpack(__global int *dev_nbor,
     }
   } // if ii
 }
-
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index f1da437c86..2aca505396 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -1,6 +1,7 @@
 // **************************************************************************
 //                               neighbor_gpu.cu
 //                             -------------------
+//                            Nitin Dhamankar (Intel)
 //                               Peng Wang (Nvidia)
 //                            W. Michael Brown (ORNL)
 //
@@ -32,7 +33,14 @@
 _texture( pos_tex,float4);
 _texture_2d( pos_tex,int4);
 #endif

-__kernel void calc_cell_id(const numtyp4 *restrict pos,
+#ifdef NV_KERNEL
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 2)
+// Issue with incorrect results in CUDA 11.2
+#define LAL_USE_OLD_NEIGHBOR
+#endif
+#endif
+
+__kernel void calc_cell_id(const numtyp4 *restrict x_,
                            unsigned *restrict cell_id,
                            int *restrict particle_id,
                            numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
@@ -43,7 +51,7 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
   if (i < nall) {
     numtyp4 p;
-    fetch4(p,i,pos_tex); //pos[i];
+    fetch4(p,i,pos_tex); //x_[i];

     p.x -= boxlo0;
     p.y -= boxlo1;
@@ -138,16 +146,219 @@ __kernel void transpose(__global tagint *restrict out,
     out[j*rows_in+i] = block[ti][tj];
 }

+#ifndef LAL_USE_OLD_NEIGHBOR
+
+#define MAX_STENCIL_SIZE 25
+#if !defined(MAX_SUBGROUPS_PER_BLOCK)
+#define MAX_SUBGROUPS_PER_BLOCK 8
+#endif
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+__device__ __constant__ int bin_stencil[MAX_STENCIL_SIZE];
+#endif
+
 __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
-                                   const __global int *restrict cell_particle_id,
-                                   const __global int *restrict cell_counts,
-                                   __global int *nbor_list,
-                                   __global int *host_nbor_list,
-                                   __global int *host_numj,
-                                   int neigh_bin_size, numtyp cell_size,
-                                   int ncellx, int ncelly, int ncellz,
-                                   int inum, int nt, int nall, int t_per_atom,
-                                   int cells_in_cutoff)
+                                   const __global int *restrict cell_particle_id,
+                                   const __global int *restrict cell_counts,
+                                   __global int *nbor_list,
+                                   __global int *host_nbor_list,
+                                   __global int *host_numj,
+                                   int neigh_bin_size, numtyp cutoff_neigh,
+                                   int ncellx, int ncelly, int ncellz,
+                                   int inum, int nt, int nall, int t_per_atom,
+                                   int cells_in_cutoff,
+                                   const __global int *restrict cell_subgroup_counts,
+                                   const __global int *restrict subgroup2cell,
+                                   int subgroup_count,
+#if defined(NV_KERNEL) || defined(USE_HIP)
+                                   int *not_used, __global int *error_flag)
+#else
+                                   __constant int *bin_stencil,
+                                   __global int *error_flag)
+#endif
+{
+  int tid = THREAD_ID_X;
+  int bsx = BLOCK_SIZE_X;
+  int simd_size = simd_size();
+  int subgroup_id_local = tid / simd_size;
+  int subgroup_id_global = BLOCK_ID_X * bsx / simd_size + subgroup_id_local;
+  int lane_id = tid % simd_size;
+
+#if (SHUFFLE_AVAIL == 0)
+  __local int cell_list_sh[BLOCK_NBOR_BUILD];
+  __local numtyp4 pos_sh[BLOCK_NBOR_BUILD];
+  __local int local_cell_counts[BLOCK_NBOR_BUILD];
+#endif
+  __local int local_begin[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
+  __local int local_counts[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
+
+  if (subgroup_id_global < subgroup_count) {
+    // identify own cell for subgroup (icell) and local atom (i) for the lane
+    int icell = subgroup2cell[subgroup_id_global];
+    int icell_end = cell_counts[icell+1];
+    int i = cell_counts[icell] + (subgroup_id_global -
+                                  cell_subgroup_counts[icell]) *
+            simd_size + lane_id;
+
+    // Get count of the number of iterations to finish all cells
+    const int bin_stencil_stride = cells_in_cutoff * 2 + 1;
+    const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride;
+    int offset = 0;
+    int cell_count = 0, jcellyz, jcell_begin;
+    const int offset2 = subgroup_id_local * (MAX_STENCIL_SIZE+1);
+    const int niter = (bin_stencil_size - 1)/simd_size + 1;
+    int end_idx = simd_size;
+    for (int ni = 0; ni < niter; ni++) {
+      if (ni == niter - 1)
+        end_idx = bin_stencil_size - offset;
+      if (lane_id < end_idx) {
+        jcellyz = icell + bin_stencil[lane_id + offset];
+        jcell_begin = cell_counts[jcellyz - cells_in_cutoff];
+        local_begin[lane_id + offset2 + offset] = jcell_begin;
+        const int local_count = cell_counts[jcellyz + cells_in_cutoff + 1] -
+          jcell_begin;
+        cell_count += local_count;
+        local_counts[lane_id + offset2 + offset] = local_count;
+      }
+      offset += simd_size;
+    }
+
+#if (SHUFFLE_AVAIL == 0)
+    local_cell_counts[tid] = cell_count;
+    offset = subgroup_id_local * simd_size;
+    for (unsigned int mask=simd_size/2; mask>0; mask>>=1) {
+      simdsync();
+      local_cell_counts[tid] += local_cell_counts[ offset + lane_id^mask ];
+    }
+    simdsync();
+    cell_count = local_cell_counts[tid];
+#else
+    #pragma unroll
+    for (unsigned int s=simd_size/2; s>0; s>>=1)
+      cell_count += shfl_xor(cell_count, s, simd_size);
+#endif
+
+    int num_iter = cell_count;
+    int remainder = num_iter % simd_size;
+    if (remainder == 0) remainder = simd_size;
+    if (num_iter) num_iter = (num_iter - 1) / simd_size + 1;
+
+    numtyp4 diff;
+    numtyp r2;
+
+    int pid_i = nall, lpid_j, stride;
+    numtyp4 atom_i, atom_j;
+    int cnt = 0;
+    __global int *neigh_counts, *neigh_list;
+
+    if (i < icell_end)
+      pid_i = cell_particle_id[i];
+
+    if (pid_i < nt) {
+      fetch4(atom_i,pid_i,pos_tex); //pos[i];
+    }
+
+    if (pid_i < inum) {
+      stride=inum;
+      neigh_counts=nbor_list+stride+pid_i;
+      neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
+      stride=stride*t_per_atom-t_per_atom;
+      nbor_list[pid_i]=pid_i;
+    } else {
+      stride=0;
+      neigh_counts=host_numj+pid_i-inum;
+      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
+    }
+
+    // loop through neighbors
+    int bin_shift = 0;
+    int zy = -1;
+    int num_atom_cell = 0;
+    int cell_pos = lane_id;
+    end_idx = simd_size;
+    for (int ci = 0; ci < num_iter; ci++) {
+      cell_pos += simd_size;
+      while (cell_pos >= num_atom_cell && zy < bin_stencil_size) {
+        // Shift lane index into atom bins based on remainder from last bin
+        bin_shift += num_atom_cell % simd_size;
+        if (bin_shift >= simd_size) bin_shift -= simd_size;
+        cell_pos = lane_id - bin_shift;
+        if (cell_pos < 0) cell_pos += simd_size;
+        // Move to next bin
+        zy++;
+        jcell_begin = local_begin[offset2 + zy];
+        num_atom_cell = local_counts[offset2 + zy];
+      }
+
+      if (zy < bin_stencil_size) {
+        lpid_j = cell_particle_id[jcell_begin + cell_pos];
+        fetch4(atom_j,lpid_j,pos_tex);
+#if (SHUFFLE_AVAIL == 0)
+        cell_list_sh[tid] = lpid_j;
+        pos_sh[tid].x = atom_j.x;
+        pos_sh[tid].y = atom_j.y;
+        pos_sh[tid].z = atom_j.z;
+      }
+      simdsync();
+#else
+      }
+#endif
+
+      if (ci == num_iter-1) end_idx = remainder;
+
+      for (int j = 0; j < end_idx; j++) {
+#if (SHUFFLE_AVAIL == 0)
+        int pid_j = cell_list_sh[offset+j]; // gather from shared memory
+        diff.x = atom_i.x - pos_sh[offset+j].x;
+        diff.y = atom_i.y - pos_sh[offset+j].y;
+        diff.z = atom_i.z - pos_sh[offset+j].z;
+#else
+        int pid_j = simd_broadcast_i(lpid_j, j, simd_size);
+#ifdef _DOUBLE_DOUBLE
+        diff.x = atom_i.x - simd_broadcast_d(atom_j.x, j, simd_size);
+        diff.y = atom_i.y - simd_broadcast_d(atom_j.y, j, simd_size);
+        diff.z = atom_i.z - simd_broadcast_d(atom_j.z, j, simd_size);
+#else
+        diff.x = atom_i.x - simd_broadcast_f(atom_j.x, j, simd_size);
+        diff.y = atom_i.y - simd_broadcast_f(atom_j.y, j, simd_size);
+        diff.z = atom_i.z - simd_broadcast_f(atom_j.z, j, simd_size);
+#endif
+#endif
+
+        r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
+//USE CUTOFFSQ?
+        if (r2 < cutoff_neigh*cutoff_neigh && pid_j != pid_i && pid_i < nt) {
+          if (cnt < neigh_bin_size) {
+            cnt++;
+            *neigh_list = pid_j;
+            neigh_list++;
+            if ((cnt & (t_per_atom-1))==0)
+              neigh_list=neigh_list+stride;
+          } else
+            *error_flag=1;
+        }
+      } // for j
+#if (SHUFFLE_AVAIL == 0)
+      simdsync();
+#endif
+    } // for (ci)
+    if (pid_i < nt)
+      *neigh_counts = cnt;
+  } // if (subgroup_id_global < subgroup_count)
+}
+
+#else
+
+__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
+                                   const __global int *restrict cell_particle_id,
+                                   const __global int *restrict cell_counts,
+                                   __global int *nbor_list,
+                                   __global int *host_nbor_list,
+                                   __global int *host_numj,
+                                   int neigh_bin_size, numtyp cell_size,
+                                   int ncellx, int ncelly, int ncellz,
+                                   int inum, int nt, int nall, int t_per_atom,
+                                   int cells_in_cutoff)
 {
   int tid = THREAD_ID_X;
   int ix = BLOCK_ID_X + cells_in_cutoff;
@@ -232,7 +443,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
         diff.z = atom_i.z - pos_sh[j].z;

         r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-        if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5
+        if (r2 < cell_size*cell_size && pid_j != pid_i) {
           cnt++;
           if (cnt <= neigh_bin_size) {
             *neigh_list = pid_j;
@@ -253,6 +464,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
   } // for (i)
 }

+#endif
+
 __kernel void kernel_special(__global int *dev_nbor,
                              __global int *host_nbor_list,
                              const __global int *host_numj,
@@ -310,4 +523,3 @@ __kernel void kernel_special(__global int *dev_nbor,
     }
   } // if ii
 }
-
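In the subgroup-based build kernel above, each SIMD subgroup owns one simd_size-wide slice of a cell: cell_subgroup_counts holds a prefix sum of ceil(atoms_in_cell/simd_size), and subgroup2cell inverts it so a flat subgroup id recovers its cell; a lane then indexes its atom as cell_start + (subgroup_id - prefix[cell])*simd_size + lane_id. The host-side construction of that mapping, mirrored in miniature with invented counts:

    #include <cstdio>
    #include <vector>

    int main() {
      const int simd = 8;
      const std::vector<int> cell_counts = {5, 17, 9};  // atoms per cell
      std::vector<int> prefix(cell_counts.size() + 1, 0);
      std::vector<int> subgroup2cell;
      for (std::size_t c = 0; c < cell_counts.size(); ++c) {
        const int nsub = (cell_counts[c] + simd - 1) / simd;  // ceil
        prefix[c + 1] = prefix[c] + nsub;
        for (int s = 0; s < nsub; ++s) subgroup2cell.push_back((int)c);
      }
      for (std::size_t g = 0; g < subgroup2cell.size(); ++g)
        std::printf("subgroup %zu -> cell %d\n", g, subgroup2cell[g]);
    }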
diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp
index f1458b35be..e1c3f5ca68 100644
--- a/lib/gpu/lal_neighbor_shared.cpp
+++ b/lib/gpu/lal_neighbor_shared.cpp
@@ -13,6 +13,7 @@
     email                : brownw@ornl.gov
 ***************************************************************************/

+#include <cmath>
 #include "lal_precision.h"
 #include "lal_neighbor_shared.h"

@@ -48,6 +49,45 @@ void NeighborShared::clear() {
   }
 }

+double NeighborShared::best_cell_size(const double subx, const double suby,
+                                      const double subz, const int nlocal,
+                                      const double cut) {
+  if (_cached_cell_size && _cut_sort==cut) {
+    _cached_cell_size=false;
+    return _cell_size;
+  }
+
+  const double box_density = static_cast<double>(nlocal) / (subx*suby*subz);
+  const double density=box_density*cut*cut*cut;
+  if (density >= 4.0 * _simd_size) return cut*0.5;
+  else if (density >= 0.5 * _simd_size) return cut;
+
+  const double iters = 60;
+  const double inc = cut/(iters-1);
+  const double iss = 1.0 / _simd_size;
+  double test_size = cut;
+  double best_iters = 1e200;
+  double best_size;
+  for (int i = 0; i < iters; i++) {
+    const double i_test_size = 1.0/test_size;
+    const int ncellx = static_cast<int>(ceil(subx*i_test_size));
+    const int ncelly = static_cast<int>(ceil(suby*i_test_size));
+    const int ncellz = static_cast<int>(ceil(subz*i_test_size));
+    const double density = box_density*test_size*test_size*test_size;
+    const double iters_per_cell = ceil(iss*density);
+    const double iters = ncellx*ncelly*ncellz*iters_per_cell*
+      ceil(density*27.0*iss);
+    if (iters < best_iters) {
+      best_iters = iters;
+      best_size = test_size;
+    }
+    test_size += inc;
+  }
+  const int cells_in_cutoff=static_cast<int>(ceil(cut/best_size));
+  if (cells_in_cutoff > 2) best_size=cut*0.5;
+  return best_size;
+}
+
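best_cell_size() prices each of 60 candidate bin sizes as (number of cells) x (SIMD passes over a cell's atoms) x (passes over the 27-cell stencil volume) and keeps the cheapest: bigger bins mean fewer cells but more atoms per SIMD pass. A condensed restatement of the cost function it minimizes, as an assumed reading of the loop above with the same names:

    #include <cmath>

    // Estimated build cost for one candidate bin size.
    double bin_cost(double box_density, double subx, double suby,
                    double subz, double test_size, int simd_size) {
      const double iss = 1.0 / simd_size;
      const double ncells = std::ceil(subx / test_size) *
                            std::ceil(suby / test_size) *
                            std::ceil(subz / test_size);
      const double density = box_density * test_size * test_size * test_size;
      return ncells * std::ceil(iss * density) *
             std::ceil(density * 27.0 * iss);
    }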
  _gpu_nbor=gpu_nbor;
  if (_gpu_nbor==0) {
    nbor_program=new UCL_Program(dev);
-    nbor_program->load_string(neighbor_cpu,flags.c_str());
+    nbor_program->load_string(neighbor_cpu,flags.c_str(),nullptr,stderr);
    k_nbor.set_function(*nbor_program,"kernel_unpack");
  } else {
    build_program=new UCL_Program(dev);
-    build_program->load_string(neighbor_gpu,flags.c_str());
+    build_program->load_string(neighbor_gpu,flags.c_str(),nullptr,stderr);
    if (_gpu_nbor==1) {
      k_cell_id.set_function(*build_program,"calc_cell_id");
diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h
index 5cfc4e4767..e574aaeaeb 100644
--- a/lib/gpu/lal_neighbor_shared.h
+++ b/lib/gpu/lal_neighbor_shared.h
@@ -47,6 +47,44 @@ class NeighborShared {
  /// Texture for cached position/type access with CUDA
  UCL_Texture neigh_tex;

+  /// Use a heuristic to approximate best bin size assuming uniform density
+  /** Only called from core LAMMPS to size bins for atom sorting **/
+  inline double update_cell_size(const double subx, const double suby,
+                                 const double subz, const int nlocal,
+                                 const double cut) {
+    if (_auto_cell_size==false || subz==0.0) return cut;
+    else {
+      _cell_size=best_cell_size(subx, suby, subz, nlocal, cut);
+      _cached_cell_size=true;
+      _cut_sort=cut;
+      return _cell_size;
+    }
+  }
+
+  /// Use a heuristic to approximate best bin size assuming uniform density
+  double best_cell_size(const double subx, const double suby,
+                        const double subz, const int nlocal,
+                        const double cut);
+
+  /// Current cutoff used for cell size determination
+  inline double neighbor_cutoff() { return _neighbor_cutoff; }
+
+  /// Current neighbor cell size
+  inline double cell_size() { return _cell_size; }
+
+  /// Return setting for auto cell size
+  inline bool auto_cell_size() { return _auto_cell_size; }
+
+  inline void setup_auto_cell_size(const bool autosize, const double cut,
+                                   const int simd_size) {
+    _auto_cell_size = autosize;
+    _cached_cell_size = false;
+    _neighbor_cutoff = cut;
+    _cell_size = cut;
+    _simd_size = simd_size;
+    if (_simd_size < 2) _auto_cell_size = false;
+  }
+
  /// Compile kernels for neighbor lists
  void compile_kernels(UCL_Device &dev, const int gpu_nbor,
                       const std::string flags);
@@ -59,6 +97,8 @@ class NeighborShared {
 private:
  bool _compiled;
  int _gpu_nbor;
+  bool _auto_cell_size, _cached_cell_size;
+  double _neighbor_cutoff, _cell_size, _simd_size, _cut_sort;
};

}
diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp
index 6b5bf88ea5..6e8fe237a6 100644
--- a/lib/gpu/lal_pppm.cpp
+++ b/lib/gpu/lal_pppm.cpp
@@ -71,7 +71,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
  if (flag!=0)
    return 0;
  if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
-    flag=-5;
+    flag=-15;
    return 0;
  }
  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
@@ -133,7 +133,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
                 UCL_SUCCESS);
  UCL_H_Vec<grdtyp> view;
  view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
-  ucl_copy(d_rho_coeff,view,true);
+  ucl_copy(d_rho_coeff,view,false);
  _max_bytes+=d_rho_coeff.row_bytes();

  // Allocate storage for grid
@@ -191,6 +191,7 @@ void PPPMT::clear(const double cpu_time) {
  d_brick_counts.clear();
  error_flag.clear();
  d_brick_atoms.clear();
+  d_rho_coeff.clear();

  acc_timers();
  device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
@@ -261,7 +262,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
  double delvolinv = delxinv*delyinv*delzinv;
  grdtyp f_delvolinv = delvolinv;
-  device->zero(d_brick_counts,d_brick_counts.numel());
+  d_brick_counts.zero();
  k_particle_map.set_size(GX,BX);
  k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
                     &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
@@ -286,6 +287,10 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
  error_flag.update_host(true);
  time_out.stop();

+  #ifndef GERYON_OCL_FLUSH
+  error_flag.flush();
+  #endif
+
  _precompute_done=true;
}
@@ -351,7 +356,7 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
                   &ans->force);
  time_interp.stop();

-  ans->copy_answers(false,false,false,false);
+  ans->copy_answers(false,false,false,false,0);
  if (_kspace_split==false)
    device->add_ans_object(ans);
}
@@ -374,18 +379,19 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
#ifdef USE_OPENCL
  flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
    ucl_template_name<grdtyp>()+"4";
+  if (sizeof(grdtyp)==sizeof(double)) flags+=std::string(" -DGRD_DBL");
#endif

  if (pppm_program) delete pppm_program;
  pppm_program=new UCL_Program(dev);

#ifdef USE_OPENCL
-  pppm_program->load_string(pppm,flags.c_str());
+  pppm_program->load_string(pppm,flags.c_str(),nullptr,screen);
#else
  if (sizeof(grdtyp)==sizeof(float))
-    pppm_program->load_string(pppm_f,flags.c_str());
+    pppm_program->load_string(pppm_f,flags.c_str(),nullptr,screen);
  else
-    pppm_program->load_string(pppm_d,flags.c_str());
+    pppm_program->load_string(pppm_d,flags.c_str(),nullptr,screen);
#endif

  k_particle_map.set_function(*pppm_program,"particle_map");
diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu
index ee9f1b61d6..e17df5b88c 100644
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@@ -35,11 +35,14 @@ _texture( q_tex,int2);
#define pos_tex x_
#define q_tex q_

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+
+#ifdef GRD_DBL
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
+#endif

#endif
diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp
index b826881392..d548b94be1 100644
--- a/lib/gpu/lal_pppm_ext.cpp
+++ b/lib/gpu/lal_pppm_ext.cpp
@@ -129,7 +129,8 @@ double pppm_gpu_bytes_f() {
void pppm_gpu_forces_f(double **f) {
  double etmp;
  PPPMF.atom->data_unavail();
-  PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp);
+  int error_flag;
+  PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag);
}

double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
@@ -173,6 +174,7 @@ double pppm_gpu_bytes_d() {
void pppm_gpu_forces_d(double **f) {
  double etmp;
  PPPMD.atom->data_unavail();
-  PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp);
+  int error_flag;
+  PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag);
}
diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h
new file mode 100644
index 0000000000..d37b4a94c2
--- /dev/null
+++ b/lib/gpu/lal_pre_cuda_hip.h
@@ -0,0 +1,355 @@
+// **************************************************************************
+//                              pre_cuda_hip.h
+//                             -------------------
+//                           W. Michael Brown (ORNL)
+//                           Nitin Dhamankar (Intel)
+//
+//  Device-side preprocessor definitions for CUDA and HIP builds
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+//*************************************************************************
+//                  Device Configuration Definitions
+//                 See lal_preprocessor.h for definitions
+//*************************************************************************/
+
+// -------------------------------------------------------------------------
+//                       CUDA and HIP DEFINITIONS
+// -------------------------------------------------------------------------
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+
+// -------------------------------------------------------------------------
+//                         DEVICE CONFIGURATION
+// -------------------------------------------------------------------------
+
+
+#ifdef __HIP_PLATFORM_HCC__
+#define CONFIG_ID 303
+#define SIMD_SIZE 64
+#else
+#define CONFIG_ID 103
+#define SIMD_SIZE 32
+#endif
+
+#define MEM_THREADS SIMD_SIZE
+#define SHUFFLE_AVAIL 1
+#define FAST_MATH 1
+
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define THREADS_PER_THREE 2
+
+#define BLOCK_PAIR 256
+#define BLOCK_BIO_PAIR 256
+#define BLOCK_ELLIPSE 128
+#define PPPM_BLOCK_1D 64
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+
+#define MAX_SHARED_TYPES 11
+#define MAX_BIO_SHARED_TYPES 128
+#define PPPM_MAX_SPLINE 8
+
+// -------------------------------------------------------------------------
+//                     LEGACY DEVICE CONFIGURATION
+// -------------------------------------------------------------------------
+
+#ifdef __CUDA_ARCH__
+
+#if (__CUDA_ARCH__ < 200)
+
+#undef CONFIG_ID
+#define CONFIG_ID 101
+#undef MEM_THREADS
+#define MEM_THREADS 16
+#undef THREADS_PER_ATOM
+#define THREADS_PER_ATOM 1
+#undef THREADS_PER_CHARGE
+#define THREADS_PER_CHARGE 16
+#undef BLOCK_PAIR
+#define BLOCK_PAIR 64
+#undef BLOCK_BIO_PAIR
+#define BLOCK_BIO_PAIR 64
+#undef BLOCK_NBOR_BUILD
+#define BLOCK_NBOR_BUILD 64
+#undef MAX_SHARED_TYPES
+#define MAX_SHARED_TYPES 8
+#undef SHUFFLE_AVAIL
+#define SHUFFLE_AVAIL 0
+
+#elif (__CUDA_ARCH__ < 300)
+
+#undef CONFIG_ID
+#define CONFIG_ID 102
+#undef BLOCK_PAIR
+#define BLOCK_PAIR 128
+#undef BLOCK_BIO_PAIR
+#define BLOCK_BIO_PAIR 128
+#undef MAX_SHARED_TYPES
+#define MAX_SHARED_TYPES 8
+#undef SHUFFLE_AVAIL
+#define SHUFFLE_AVAIL 0
+
+#endif
+
+#endif
+
+// -------------------------------------------------------------------------
+//                            KERNEL MACROS
+// -------------------------------------------------------------------------
+
+#ifdef USE_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#define fast_mul(X,Y) (X)*(Y)
+
+#ifdef __CUDA_ARCH__
+#if (__CUDA_ARCH__ < 200)
+#undef fast_mul
+#define fast_mul __mul24
+#endif
+#endif
+
+#define EVFLAG 1
+#define NOUNROLL
+#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
+#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y);
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_SIZE_X blockDim.x
+#define BLOCK_SIZE_Y blockDim.y
+#define NUM_BLOCKS_X gridDim.x
+
+#define __kernel extern "C" __global__
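+// The keyword mappings below let the OpenCL-flavored kernel sources build
+// as CUDA/HIP device code: __kernel becomes an extern "C" __global__
+// function, __local maps to __shared__, and atom_add to atomicAdd.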
+#ifdef __local
+#undef __local
+#endif
+#define __local __shared__
+#define __global
+#define restrict __restrict__
+#define atom_add atomicAdd
+#define ucl_inline static __inline__ __device__
+
+#define simd_size() SIMD_SIZE
+
+#define simdsync()
+
+#ifdef NV_KERNEL
+#if (__CUDACC_VER_MAJOR__ >= 9)
+#undef simdsync
+#define simdsync() __syncwarp(0xffffffff)
+#endif
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#undef simdsync
+#define simdsync() __syncwarp(0xffffffff)
+#endif
+
+// -------------------------------------------------------------------------
+//                     KERNEL MACROS - TEXTURES
+// -------------------------------------------------------------------------
+
+#ifdef __HIP_PLATFORM_HCC__
+#define _texture(name, type) __device__ type* name
+#define _texture_2d(name, type) __device__ type* name
+#else
+#define _texture(name, type) texture<type> name
+#define _texture_2d(name, type) texture<type> name
+#endif
+
+#if (__CUDACC_VER_MAJOR__ < 11)
+  #ifdef _DOUBLE_DOUBLE
+  #define fetch4(ans,i,pos_tex) {                   \
+    int4 xy = tex1Dfetch(pos_tex,i*2);              \
+    int4 zt = tex1Dfetch(pos_tex,i*2+1);            \
+    ans.x=__hiloint2double(xy.y, xy.x);             \
+    ans.y=__hiloint2double(xy.w, xy.z);             \
+    ans.z=__hiloint2double(zt.y, zt.x);             \
+    ans.w=__hiloint2double(zt.w, zt.z);             \
+  }
+  #define fetch(ans,i,q_tex) {                      \
+    int2 qt = tex1Dfetch(q_tex,i);                  \
+    ans=__hiloint2double(qt.y, qt.x);               \
+  }
+  #else
+  #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
+  #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
+  #endif
+#else
+  #define fetch4(ans,i,x) ans=x[i]
+  #define fetch(ans,i,q) ans=q[i]
+  #undef _texture
+  #undef _texture_2d
+  #define _texture(name, type)
+  #define _texture_2d(name, type)
+  #define pos_tex x_
+  #define quat_tex qif
+  #define q_tex q_
+  #define vel_tex v_
+  #define mu_tex mu_
+#endif
+
+#ifdef __HIP_PLATFORM_HCC__
+
+#undef fetch4
+#undef fetch
+
+#ifdef _DOUBLE_DOUBLE
+#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
+#define fetch(ans,i,q_tex)    (ans=*(((double *) q_tex) + i))
+#else
+#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
+#define fetch(ans,i,q_tex)    (ans=*(((float *) q_tex) + i))
+#endif
+
+#endif
+
+// -------------------------------------------------------------------------
+//                       KERNEL MACROS - MATH
+// -------------------------------------------------------------------------
+
+#ifdef CUDA_PRE_THREE
+struct __builtin_align__(16) _double4
+{
+  double x, y, z, w;
+};
+typedef struct _double4 double4;
+#endif
+
+#ifdef _DOUBLE_DOUBLE
+
+#define ucl_exp exp
+#define ucl_powr pow
+#define ucl_atan atan
+#define ucl_cbrt cbrt
+#define ucl_ceil ceil
+#define ucl_abs fabs
+#define ucl_rsqrt rsqrt
+#define ucl_sqrt sqrt
+#define ucl_recip(x) ((numtyp)1.0/(x))
+
+#else
+
+#define ucl_atan atanf
+#define ucl_cbrt cbrtf
+#define ucl_ceil ceilf
+#define ucl_abs fabsf
+#define ucl_recip(x) ((numtyp)1.0/(x))
+#define ucl_rsqrt rsqrtf
+#define ucl_sqrt sqrtf
+#define ucl_exp expf
+#define ucl_powr powf
+
+#endif
+
+// -------------------------------------------------------------------------
+//                      KERNEL MACROS - SHUFFLE
+// -------------------------------------------------------------------------
+
+#if SHUFFLE_AVAIL == 1
+
+#ifndef USE_HIP
+#if (__CUDACC_VER_MAJOR__ < 9)
+#define CUDA_PRE_NINE
+#endif
+#endif
+
+#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
+
+  #ifdef _SINGLE_SINGLE
+  #define shfl_down __shfl_down
+  #define shfl_xor __shfl_xor
+  #else
+  ucl_inline double shfl_down(double var, unsigned int delta, int width) {
+    int2 tmp;
+    tmp.x = __double2hiint(var);
+    tmp.y =
__double2loint(var); + tmp.x = __shfl_down(tmp.x,delta,width); + tmp.y = __shfl_down(tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor(tmp.x,lanemask,width); + tmp.y = __shfl_xor(tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i __shfl + #define simd_broadcast_f __shfl + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, + int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl(tmp.x,src,width); + tmp.y = __shfl(tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + +#else + + #ifdef _SINGLE_SINGLE + ucl_inline float shfl_down(float var, unsigned int delta, int width) { + return __shfl_down_sync(0xffffffff, var, delta, width); + } + ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) { + return __shfl_xor_sync(0xffffffff, var, lanemask, width); + } + #else + ucl_inline double shfl_down(double var, unsigned int delta, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width); + tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width); + tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #define simd_broadcast_f(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width); + tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif +#endif + +#endif + +// ------------------------------------------------------------------------- +// END CUDA / HIP DEFINITIONS +// ------------------------------------------------------------------------- + +#endif diff --git a/lib/gpu/lal_pre_ocl_config.h b/lib/gpu/lal_pre_ocl_config.h new file mode 100644 index 0000000000..15c503c942 --- /dev/null +++ b/lib/gpu/lal_pre_ocl_config.h @@ -0,0 +1,53 @@ +// ************************************************************************** +// pre_ocl_config.h +// ------------------- +// W. 
Michael Brown (ORNL) +// Nitin Dhamankar (Intel) +// +// Device-side preprocessor definitions +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +//************************************************************************* +// Device Configuration Definitions +// See lal_preprocessor.h for definitions +// Configuration order: +// +// {CONFIG_NAME, CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH, +// THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, +// BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, +// BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, +// PPPM_MAX_SPLINE} +// +//*************************************************************************/ + +const int nconfigs=6; +const char * ocl_config_names[] = + { + "generic", + "nvidiagpu", + "amdgpu", + "intelgpu", + "applegpu", + "intelcpu" + }; +const char * ocl_config_strings[] = + { + "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8", + "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8", + "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8", +#ifdef _SINGLE_SINGLE + "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8", +#else + "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8", +#endif + "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8" + }; diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 7f82ba18aa..bb2423198f 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -20,6 +20,29 @@ #include #endif +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + struct _lgpu_int2 { int x; int y; }; @@ -75,6 +98,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -84,6 +108,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_double2 #define numtyp4 _lgpu_double4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -93,44 +118,16 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION float #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_float2 #define acctyp4 _lgpu_float4 #endif enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -// OCL_DEFAULT_VENDOR: preprocessor define for hardware -// specific sizes of OpenCL kernel related constants - -#ifdef FERMI_OCL -#define OCL_DEFAULT_VENDOR "fermi" -#endif - -#ifdef KEPLER_OCL -#define OCL_DEFAULT_VENDOR "kepler" -#endif - -#ifdef CYPRESS_OCL 
-#define OCL_DEFAULT_VENDOR "cypress" -#endif - -#ifdef GENERIC_OCL -#define OCL_DEFAULT_VENDOR "generic" -#endif - -#ifdef INTEL_OCL -#define OCL_DEFAULT_VENDOR "intel" -#endif - -#ifdef PHI_OCL -#define OCL_DEFAULT_VENDOR "phi" -#endif - -#ifndef OCL_DEFAULT_VENDOR -#define OCL_DEFAULT_VENDOR "none" -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 7c94438272..12cf6345c2 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,9 +1,10 @@ // ************************************************************************** -// preprocessor.cu +// preprocessor.h // ------------------- // W. Michael Brown (ORNL) +// Nitin Dhamankar (Intel) // -// Device code for CUDA-specific preprocessor definitions +// Device-side preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -14,566 +15,136 @@ // ***************************************************************************/ //************************************************************************* -// Preprocessor Definitions +// Device Configuration Definitions // -// Note: It is assumed that constants with the same names are defined with -// the same values in all files. +// For OpenCL, the configuration is a string (optionally controlled at +// runtime) where tokens specify the values below in order) // -// ARCH -// Definition: Architecture number for accelerator +// CONFIG_ID: +// Definition: Unique ID for a configuration +// 100-199 for NVIDIA GPUs with CUDA / HIP +// 200-299 for NVIDIA GPUs with OpenCL +// 300-399 for AMD GPUs with HIP +// 400-499 for AMD GPUs with OpenCL +// 500-599 for Intel GPUs with OpenCL +// SIMD_SIZE: +// Definition: For CUDA this is the warp size. +// For AMD this is the wavefront size. +// For OpenCL < 2.1 this is the number of workitems +// guarenteed to have the same instruction pointer +// For OpenCL >= 2.1 this is the smallest expected subgroup +// size. Actually subgroup sizes are determined per kernel. // MEM_THREADS -// Definition: Number of threads with sequential ids accessing memory -// simultaneously on multiprocessor -// WARP_SIZE: -// Definition: Number of threads guaranteed to be on the same instruction +// Definition: Number of elements in main memory transaction. Used in +// PPPM. If unknown, set to SIMD_SIZE. +// SHUFFLE_AVAIL +// Definition: Controls the use of instructions for horizontal vector +// operations. 0 disables and will increase shared memory +// usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on +// NVIDIA and Intel devices. +// FAST_MATH +// Definition: 0: do not use -cl-fast-relaxed-math optimization flag or +// native transcendentals for OpenCL (fused multiply-add +// still enabled). For CUDA and HIP, this is controlled by +// the Makefile at compile time. 
1: enable fast math opts +// // THREADS_PER_ATOM -// Definition: Default number of threads assigned per atom for pair styles -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE // THREADS_PER_CHARGE -// Definition: Default number of threads assigned per atom for pair styles -// with charge -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE -// PPPM_MAX_SPLINE -// Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D -// Definition: Thread block size for PPPM kernels -// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles using charge +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE +// THREADS_PER_THREE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for 3-body styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM^2<=SIMD_SIZE +// // BLOCK_PAIR -// Definition: Default thread block size for pair styles -// Restrictions: +// Definition: Default block size for pair styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_BIO_PAIR +// Definition: Default block size for CHARMM styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_ELLIPSE +// Definition: Default block size for ellipsoidal models and some 3-body +// styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// PPPM_BLOCK_1D +// Definition: Default block size for PPPM kernels +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_NBOR_BUILD +// Definition: Default block size for neighbor list builds +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_CELL_2D +// Definition: Default block size in each dimension for matrix transpose +// BLOCK_CELL_ID +// Definition: Unused in current implementation; Maintained for legacy +// purposes and specialized builds +// // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D -// Definition: Default block size in each dimension for cell list builds -// and matrix transpose -// BLOCK_CELL_ID -// Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD -// Definition: Default block size for neighbor list builds -// BLOCK_BIO_PAIR -// Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory -// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// PPPM_MAX_SPLINE +// Definition: Maximum order for splines in PPPM +// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // //*************************************************************************/ -#define _texture(name, type) texture name -#define _texture_2d(name, type) texture name - // ------------------------------------------------------------------------- -// HIP DEFINITIONS +// CUDA and HIP DEFINITIONS // ------------------------------------------------------------------------- -#ifdef USE_HIP - #include - #ifdef __HIP_PLATFORM_HCC__ - #define mul24(x, y) __mul24(x, y) - #undef _texture - #undef _texture_2d - #define _texture(name, type) __device__ type* name - 
#define _texture_2d(name, type) __device__ type* name - #endif - #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) - #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) - #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); - #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); - #define THREAD_ID_X threadIdx.x - #define THREAD_ID_Y threadIdx.y - #define BLOCK_ID_X blockIdx.x - #define BLOCK_ID_Y blockIdx.y - #define BLOCK_SIZE_X blockDim.x - #define BLOCK_SIZE_Y blockDim.y - #define __kernel extern "C" __global__ - #ifdef __local - #undef __local - #endif - #define __local __shared__ - #define __global - #define restrict __restrict__ - #define atom_add atomicAdd - #define ucl_inline static __inline__ __device__ - - #define THREADS_PER_ATOM 4 - #define THREADS_PER_CHARGE 8 - #define BLOCK_NBOR_BUILD 128 - #define BLOCK_PAIR 256 - #define BLOCK_BIO_PAIR 256 - #define BLOCK_ELLIPSE 128 - #define MAX_SHARED_TYPES 11 - - #ifdef _SINGLE_SINGLE - ucl_inline double shfl_xor(double var, int laneMask, int width) { - #ifdef __HIP_PLATFORM_HCC__ - return __shfl_xor(var, laneMask, width); - #else - return __shfl_xor_sync(0xffffffff, var, laneMask, width); - #endif - } - #else - ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - #ifdef __HIP_PLATFORM_HCC__ - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - #else - tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width); - #endif - return __hiloint2double(tmp.x,tmp.y); - } - #endif - - #ifdef __HIP_PLATFORM_HCC__ - #define ARCH 600 - #define WARP_SIZE 64 - #endif - - #ifdef __HIP_PLATFORM_NVCC__ - #define ARCH __CUDA_ARCH__ - #define WARP_SIZE 32 - #endif - - #define fast_mul(X,Y) (X)*(Y) - - #define MEM_THREADS WARP_SIZE - #define PPPM_BLOCK_1D 64 - #define BLOCK_CELL_2D 8 - #define BLOCK_CELL_ID 128 - #define MAX_BIO_SHARED_TYPES 128 - - #ifdef __HIP_PLATFORM_NVCC__ - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ - } - #define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ - } - #else - #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); - #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); - #endif - #else - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i)) - #else - #define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i)) - #endif - #endif - - #ifdef _DOUBLE_DOUBLE - #define ucl_exp exp - #define ucl_powr pow - #define ucl_atan atan - #define ucl_cbrt cbrt - #define ucl_ceil ceil - #define ucl_abs fabs - #define ucl_rsqrt rsqrt - #define ucl_sqrt sqrt - #define ucl_recip(x) ((numtyp)1.0/(x)) - - #else - #define ucl_atan atanf - #define ucl_cbrt cbrtf - #define ucl_ceil ceilf - #define ucl_abs fabsf - #define ucl_recip(x) ((numtyp)1.0/(x)) - #define ucl_rsqrt rsqrtf - #define ucl_sqrt sqrtf - - #ifdef NO_HARDWARE_TRANSCENDENTALS - #define ucl_exp expf - #define ucl_powr powf - #else - #define ucl_exp __expf - #define ucl_powr __powf - #endif - #endif -#endif - -// 
------------------------------------------------------------------------- -// CUDA DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef NV_KERNEL - -#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) -#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); -#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_SIZE_X blockDim.x -#define BLOCK_SIZE_Y blockDim.y -#define __kernel extern "C" __global__ -#define __local __shared__ -#define __global -#define restrict __restrict__ -#define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ - -#ifdef __CUDA_ARCH__ -#define ARCH __CUDA_ARCH__ -#else -#define ARCH 100 -#endif - -#if (ARCH < 200) - -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 16 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_PAIR 64 -#define BLOCK_BIO_PAIR 64 -#define MAX_SHARED_TYPES 8 - -#else - -#if (ARCH < 300) - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 128 -#define BLOCK_BIO_PAIR 128 -#define MAX_SHARED_TYPES 8 - -#else - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 256 -#define BLOCK_BIO_PAIR 256 -#define BLOCK_ELLIPSE 128 -#define MAX_SHARED_TYPES 11 - -#if (__CUDACC_VER_MAJOR__ < 9) - -#ifdef _SINGLE_SINGLE -#define shfl_xor __shfl_xor -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#else - -#ifdef _SINGLE_SINGLE -ucl_inline double shfl_xor(double var, int laneMask, int width) { - return __shfl_xor_sync(0xffffffff, var, laneMask, width); -} -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#endif - -#endif - -#endif - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#ifdef _DOUBLE_DOUBLE -#define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ -} -#define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ -} -#else -#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); -#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); -#endif - -#if (__CUDA_ARCH__ < 200) -#define fast_mul __mul24 -#define MEM_THREADS 16 -#else -#define fast_mul(X,Y) (X)*(Y) -#define MEM_THREADS 32 -#endif - -#ifdef CUDA_PRE_THREE -struct __builtin_align__(16) _double4 -{ - double x, y, z, w; -}; -typedef struct _double4 double4; -#endif - -#ifdef _DOUBLE_DOUBLE - -#define ucl_exp exp -#define ucl_powr pow -#define ucl_atan atan -#define ucl_cbrt cbrt -#define ucl_ceil ceil -#define ucl_abs fabs -#define ucl_rsqrt rsqrt -#define ucl_sqrt sqrt -#define 
ucl_recip(x) ((numtyp)1.0/(x))
-
-#else
-
-#define ucl_atan atanf
-#define ucl_cbrt cbrtf
-#define ucl_ceil ceilf
-#define ucl_abs fabsf
-#define ucl_recip(x) ((numtyp)1.0/(x))
-#define ucl_rsqrt rsqrtf
-#define ucl_sqrt sqrtf
-
-#ifdef NO_HARDWARE_TRANSCENDENTALS
-
-#define ucl_exp expf
-#define ucl_powr powf
-
-#else
-
-#define ucl_exp __expf
-#define ucl_powr __powf
-
-#endif
-
-#endif
-
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include "lal_pre_cuda_hip.h"
+#endif

// -------------------------------------------------------------------------
-//                  NVIDIA GENERIC OPENCL DEFINITIONS
+//                    OPENCL DEVICE CONFIGURATIONS
// -------------------------------------------------------------------------

-#ifdef NV_GENERIC_OCL
+// See lal_pre_ocl_config.h for OpenCL device configurations
+
+#if !defined(NV_KERNEL) && !defined(USE_HIP)

#define USE_OPENCL
-#define fast_mul mul24
-#define MEM_THREADS 16
-#define THREADS_PER_ATOM 1
-#define THREADS_PER_CHARGE 1
-#define BLOCK_PAIR 64
-#define MAX_SHARED_TYPES 8
-#define BLOCK_NBOR_BUILD 64
-#define BLOCK_BIO_PAIR 64
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#endif

// -------------------------------------------------------------------------
-//                  NVIDIA FERMI OPENCL DEFINITIONS
+//                        OPENCL KERNEL MACROS
// -------------------------------------------------------------------------

-#ifdef FERMI_OCL
-
-#define USE_OPENCL
-#define MEM_THREADS 32
-#define THREADS_PER_ATOM 4
-#define THREADS_PER_CHARGE 8
-#define BLOCK_PAIR 128
-#define MAX_SHARED_TYPES 11
-#define BLOCK_NBOR_BUILD 128
-#define BLOCK_BIO_PAIR 128
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#endif
-
-// -------------------------------------------------------------------------
-//                  NVIDIA KEPLER OPENCL DEFINITIONS
-// -------------------------------------------------------------------------
-
-#ifdef KEPLER_OCL
-
-#define USE_OPENCL
-#define MEM_THREADS 32
-#define THREADS_PER_ATOM 4
-#define THREADS_PER_CHARGE 8
-#define BLOCK_PAIR 256
-#define MAX_SHARED_TYPES 11
-#define BLOCK_NBOR_BUILD 128
-#define BLOCK_BIO_PAIR 256
-#define BLOCK_ELLIPSE 128
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#ifndef NO_OCL_PTX
-#define ARCH 300
-#ifdef _SINGLE_SINGLE
-inline float shfl_xor(float var, int laneMask, int width) {
-  float ret;
-  int c;
-  c = ((WARP_SIZE-width) << 8) | 0x1f;
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
-  return ret;
-}
+#if (__OPENCL_VERSION__ > 199)
+#define NOUNROLL __attribute__((opencl_unroll_hint(1)))
#else
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-inline double shfl_xor(double var, int laneMask, int width) {
-  int c = ((WARP_SIZE-width) << 8) | 0x1f;
-  int x,y,x2,y2;
-  double ans;
-  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
-  asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
-  return ans;
-}
-#endif
+#define NOUNROLL
#endif

-#endif
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define GLOBAL_SIZE_X
get_global_size(0) +#define THREAD_ID_Y get_local_id(1) +#define BLOCK_ID_Y get_group_id(1) +#define NUM_BLOCKS_X get_num_groups(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define ucl_inline inline // ------------------------------------------------------------------------- -// AMD CYPRESS OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - TEXTURES // ------------------------------------------------------------------------- -#ifdef CYPRESS_OCL - -#define USE_OPENCL -#define MEM_THREADS 32 -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_PAIR 128 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 64 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif +#define fetch4(ans,i,x) ans=x[i] +#define fetch(ans,i,q) ans=q[i] // ------------------------------------------------------------------------- -// INTEL CPU OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - MATH // ------------------------------------------------------------------------- -#ifdef INTEL_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 1 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 4 -#define BLOCK_BIO_PAIR 2 -#define BLOCK_ELLIPSE 2 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 1 -#define BLOCK_CELL_ID 2 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// INTEL PHI OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef PHI_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 16 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 16 -#define BLOCK_BIO_PAIR 16 -#define BLOCK_ELLIPSE 16 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 4 -#define BLOCK_CELL_ID 16 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// GENERIC OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef GENERIC_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 64 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif - -// ------------------------------------------------------------------------- -// OPENCL Stuff for All Hardware -// ------------------------------------------------------------------------- -#ifdef USE_OPENCL - #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 @@ -589,48 +160,14 @@ inline double shfl_xor(double var, int laneMask, int width) { #endif -#ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) -#endif - -#ifndef ARCH -#define ARCH 0 -#endif - -#ifndef DRIVER -#define DRIVER 0 -#endif - -#define GLOBAL_ID_X get_global_id(0) -#define THREAD_ID_X get_local_id(0) -#define BLOCK_ID_X get_group_id(0) -#define BLOCK_SIZE_X get_local_size(0) -#define GLOBAL_SIZE_X get_global_size(0) -#define THREAD_ID_Y get_local_id(1) -#define BLOCK_ID_Y get_group_id(1) -#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) -#define ucl_inline inline -#define fetch4(ans,i,x) ans=x[i] -#define fetch(ans,i,q) ans=q[i] 
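+// When FAST_MATH is enabled (and the build is not double precision), the
+// ucl_* wrappers below resolve to OpenCL's native_* builtins, e.g.
+// ucl_exp(x) -> native_exp(x); otherwise the exact library calls are used.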
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs

-#ifdef _DOUBLE_DOUBLE
-#define NO_HARDWARE_TRANSCENDENTALS
-#endif
-
-#ifdef NO_HARDWARE_TRANSCENDENTALS
-
-#define ucl_exp exp
-#define ucl_powr powr
-#define ucl_rsqrt rsqrt
-#define ucl_sqrt sqrt
-#define ucl_recip(x) ((numtyp)1.0/(x))
-
-#else
+#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)

#define ucl_exp native_exp
@@ -638,23 +175,128 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define ucl_sqrt native_sqrt
#define ucl_recip native_recip

+#else
+
+#define ucl_exp exp
+#define ucl_powr powr
+#define ucl_rsqrt rsqrt
+#define ucl_sqrt sqrt
+#define ucl_recip(x) ((numtyp)1.0/(x))
+
#endif

+// -------------------------------------------------------------------------
+// OPENCL KERNEL MACROS - SHUFFLE
+// -------------------------------------------------------------------------
+
+#if (SHUFFLE_AVAIL == 1)
+  #ifdef cl_intel_subgroups
+    #pragma OPENCL EXTENSION cl_intel_subgroups : enable
+    #define shfl_down(var, delta, width)                                   \
+      intel_sub_group_shuffle_down(var, var, delta)
+    #define shfl_xor(var, lanemask, width)                                 \
+      intel_sub_group_shuffle_xor(var, lanemask)
+    #define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src)
+    #define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src)
+    #define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src)
+  #else
+    #ifdef _SINGLE_SINGLE
+    inline float shfl_down(float var, unsigned int delta, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    inline float shfl_xor(float var, unsigned int lanemask, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    #else
+    inline double shfl_down(double var, unsigned int delta, int width) {
+      int c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      int x,y,x2,y2;
+      double ans;
+      asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff));
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff));
+      asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+      return ans;
+    }
+    inline double shfl_xor(double var, unsigned int lanemask, int width) {
+      int c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      int x,y,x2,y2;
+      double ans;
+      asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+      return ans;
+    }
+    #endif
+    inline int simd_broadcast_i(int var, unsigned int src, int width) {
+      int ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(var), "r"(src), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    inline float simd_broadcast_f(float var, unsigned int src, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.idx.b32 %0,
%1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); + return ret; + } + #ifdef _DOUBLE_DOUBLE + inline double simd_broadcast_d(double var, unsigned int src, int width) { + int c = ((SIMD_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; + } + #endif + #endif +#endif + +// ------------------------------------------------------------------------- +// OPENCL KERNEL MACROS - SUBGROUPS +// ------------------------------------------------------------------------- + +#ifdef USE_OPENCL_SUBGROUPS + #ifndef cl_intel_subgroups + #pragma OPENCL EXTENSION cl_khr_subgroups : enable + #endif + #define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE) + #define simd_size() get_max_sub_group_size() +#else + #define simdsync() + #define simd_size() SIMD_SIZE +#endif + +// ------------------------------------------------------------------------- +// END OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- -#ifndef PPPM_MAX_SPLINE -#define PPPM_MAX_SPLINE 8 -#endif - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -663,6 +305,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -671,6 +314,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp float +#define acctyp2 float2 #define acctyp4 float4 #endif @@ -686,11 +330,9 @@ inline double shfl_xor(double var, int laneMask, int width) { #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; -#ifndef BLOCK_ELLIPSE -#define BLOCK_ELLIPSE BLOCK_PAIR -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 81dc3b13a4..aabfb9d39f 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -116,7 +116,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -125,7 +125,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - 
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
+  view4.view(host_write,well.numel());
  ucl_copy(well,view4,false);

  _allocated=true;
@@ -172,18 +172,8 @@ double RESquaredT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void RESquaredT::loop(const bool _eflag, const bool _vflag) {
+int RESquaredT::loop(const int eflag, const int vflag) {
  const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;

  int GX=0, NGX;
  int stride=this->nbor->nbor_pitch();
@@ -201,8 +191,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
    this->time_nbor1.stop();

    this->time_ellipsoid.start();
-    this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sel->set_size(GX,BX);
+    this->k_elps_sel->run(&this->atom->x, &this->atom->quat,
                          &this->shape, &this->well, &this->special_lj,
                          &this->sigma_epsilon, &this->_lj_types,
                          &this->nbor->dev_nbor, &stride,
@@ -218,8 +208,8 @@
    this->time_nbor2.stop();

    this->time_ellipsoid2.start();
-    this->k_ellipsoid_sphere.set_size(GX,BX);
-    this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sphere_sel->set_size(GX,BX);
+    this->k_elps_sphere_sel->run(&this->atom->x, &this->atom->quat,
                                 &this->shape, &this->well, &this->special_lj,
                                 &this->sigma_epsilon, &this->_lj_types,
                                 &this->nbor->dev_nbor, &stride,
@@ -233,7 +223,7 @@
      this->time_nbor3.zero();
      this->time_ellipsoid3.zero();
      this->time_lj.zero();
-      return;
+      return ainum;
    }

    // ------------ SPHERE_ELLIPSE ---------------
@@ -249,8 +239,8 @@
    this->time_nbor3.stop();

    this->time_ellipsoid3.start();
-    this->k_sphere_ellipsoid.set_size(GX,BX);
-    this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_sphere_elps_sel->set_size(GX,BX);
+    this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat,
                                 &this->shape, &this->well, &this->special_lj,
                                 &this->sigma_epsilon, &this->_lj_types,
                                 &this->nbor->dev_nbor, &stride,
@@ -277,8 +267,8 @@
    this->time_lj.start();
    if (this->_last_ellipse<this->ans->inum()) {
      if (this->_shared_types) {
-        this->k_lj_fast.set_size(GX,BX);
-        this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
+        this->k_lj_sel->set_size(GX,BX);
+        this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3,
                            &this->special_lj, &stride,
                            &this->nbor->dev_packed, &this->ans->force,
                            &this->ans->engv, &this->dev_error,
@@ -303,8 +293,8 @@
                          ELLIPSE_ELLIPSE,_shared_types,_lj_types);
    this->time_nbor1.stop();
    this->time_ellipsoid.start();
-    this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sel->set_size(GX,BX);
+    this->k_elps_sel->run(&this->atom->x, &this->atom->quat,
                          &this->shape, &this->well, &this->special_lj,
                          &this->sigma_epsilon, &this->_lj_types,
                          &this->nbor->dev_nbor, &stride, &this->ans->force,
@@ -312,6 +302,7 @@
                          &eflag, &vflag, &ainum, &this->_threads_per_atom);
    this->time_ellipsoid.stop();
  }
+  return ainum;
}

template class RESquared<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_re_squared.cu
b/lib/gpu/lal_re_squared.cu index 8852a46913..c69a338749 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -51,33 +51,30 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -452,8 +449,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } - diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index 9e4f4af67a..1b0a837764 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -82,7 +82,7 @@ class RESquared : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 112a4db8d9..ca1b08facd 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -17,12 +17,18 @@ #include "lal_ellipsoid_extra.h" #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + +#if (SHUFFLE_AVAIL == 0) #define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -30,6 +36,7 @@ red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -41,28 +48,39 @@ tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for 
(unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -82,32 +100,32 @@ #else -#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ +#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -147,35 +165,34 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if 
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -378,9 +395,9 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset, + eflag,vflag,ans,engv,inum); } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, @@ -403,31 +420,33 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -579,9 +598,9 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, @@ -601,26 +620,27 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[0]; sp_lj[1]=gum[1]; sp_lj[2]=gum[2]; sp_lj[3]=gum[3]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -666,9 +686,9 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } } } // for nbor - 
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, @@ -690,31 +710,32 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -760,8 +781,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index 8e944fa0a5..e77be5a011 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -121,20 +121,9 @@ double SoftT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SoftT::loop(const bool _eflag, const bool _vflag) { +int SoftT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -155,6 +144,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Soft; diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index 5df34e7b1d..74ac0e0c97 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -40,22 +40,25 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if 
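/* Illustrative sketch (not part of the patch): the shape shared by every
   loop() rewrite in this change, as seen in lal_soft.cpp above. eflag and
   vflag now pass straight through as ints, and the computed grid size GX is
   returned to the caller. Class and member names are illustrative. */
template <class numtyp, class acctyp>
int Example<numtyp,acctyp>::loop(const int eflag, const int vflag) {
  const int BX = this->block_size();   // threads per block
  const int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
                                       (BX / this->_threads_per_atom)));
  this->time_pair.start();
  this->k_pair_sel->set_size(GX, BX);  // k_pair_sel picks the compiled variant
  // this->k_pair_sel->run(..., &eflag, &vflag, ...);
  this->time_pair.stop();
  return GX;                           // callers reuse the grid size
}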
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_soft_fast(const __global numtyp4 *restrict x_, @@ -125,25 +128,28 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,8 +200,8 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index b33314ee03..fd86f62927 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -73,7 +73,7 @@ class Soft : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 7c0cbe7973..a32a5e5a00 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -55,7 +55,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, int init_ok=0; if (world_me==0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, } if (gpu_rank==i && world_me!=0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 5c7bd45c76..eb42c710cc 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -43,114 +43,83 @@ int SWT::bytes_per_atom(const int max_nbors) const { } template -int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq) -{ +int SWT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double 
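/* Illustrative sketch, assuming EVFLAG is a 0/1 compile-time constant baked
   into each kernel build: when EVFLAG is 0 the guarded accumulators and all
   energy/virial stores become dead code the compiler drops, which is the
   point of the "noev" kernel variants elsewhere in this patch. */
__kernel void k_sketch(__global acctyp *engv, const int eflag, const int vflag) {
  acctyp energy, virial[6];
  if (EVFLAG) {                       // removed entirely when EVFLAG == 0
    energy = (acctyp)0;
    for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
  }
  /* ... force computation ... */
  if (EVFLAG && vflag) {              // runtime vflag tested only if compiled in
    /* virial[0] += delx*delx*force; ... */
  }
  if (EVFLAG && eflag) {
    /* energy += ...; */
  }
}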
**sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { + _lj_types=ntypes; + + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end","k_sw_short_nbor"); + "k_sw_three_end","k_sw_short_nbor",onetype, + onetype3,spq); if (success!=0) return success; - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; + UCL_H_Vec host_write(ntypes*ntypes*ntypes*4,*(this->ucl_device), + UCL_WRITE_ONLY); + host_write.zero(); - _nparams = nparams; - _nelements = nelements; - - UCL_H_Vec dview(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - - for (int i=0; i 0.0 && ncutsq[i][j]>=ccutsq) + ncutsq[i][j]=ccutsq*0.98; } // pack coefficients into arrays - sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(epsilon[i]); - dview[i].y=static_cast(sigma[i]); - dview[i].z=static_cast(lambda[i]); - dview[i].w=static_cast(gamma[i]); - } - - ucl_copy(sw1,dview,false); - sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); - sw1_tex.bind_float(sw1,4); - - sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(biga[i]); - dview[i].y=static_cast(bigb[i]); - dview[i].z=static_cast(powerp[i]); - dview[i].w=static_cast(powerq[i]); - } - - ucl_copy(sw2,dview,false); - sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); - sw2_tex.bind_float(sw2,4); - - sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i=sw_cut*sw_cut) - sw_cutsq=sw_cut*sw_cut-1e-4; - dview[i].x=static_cast(sw_cut); - dview[i].y=static_cast(sw_cutsq); - dview[i].z=static_cast(costheta[i]); - dview[i].w=(numtyp)0; - } - - ucl_copy(sw3,dview,false); - sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); - sw3_tex.bind_float(sw3,4); - - UCL_H_Vec dview_elem2param(nelements*nelements*nelements, - *(this->ucl_device), UCL_WRITE_ONLY); - - elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device), - UCL_READ_ONLY); - - for (int i = 0; i < nelements; i++) - for (int j = 0; j < nelements; j++) - for (int k = 0; k < nelements; k++) { - int idx = i*nelements*nelements+j*nelements+k; - dview_elem2param[idx] = host_elem2param[i][j][k]; - } - - ucl_copy(elem2param,dview_elem2param,false); - - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); - for (int i = 0; i < ntypes; i++) - dview_map[i] = host_map[i]; - - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(map,dview_map,false); + cutsq.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,ntypes,cutsq,host_write,ncutsq); + sw_pre.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,sw_pre,host_write,ncut,sigma, + powerp,powerq); + c_14.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,c_14,host_write,c1,c2,c3,c4); + c_56.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,c_56,host_write,c5,c6); + 
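/* Rough reconstruction of the idea behind the onetype scan above (the exact
   loop body was lost in transmission, so treat this as an assumption): walk
   the type map once, count distinct mapped elements, and keep a single-type
   shortcut only if exactly one parameter row is ever used. That index can
   then be compiled in as ONETYPE/ONETYPE3 so the kernels hard-code its
   constants instead of indexing parameter tables. */
int onetype = -1, mtypes = 0;
for (int i = 1; i < ntypes; i++) {
  if (map[i] < 0) continue;          // type unused by this pair style
  if (onetype < 0) { onetype = i * ntypes + i; mtypes = 1; }
  else if (onetype != i * ntypes + i) mtypes++;
}
if (mtypes > 1) onetype = -1;        // more than one element: no specialization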
cut_sigma_gamma.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,cut_sigma_gamma,host_write,ncut, + sigma_gamma); + sw_pre3.alloc(ntypes*ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,sw_pre3,host_write,lambda_epsilon,costheta); _allocated=true; - this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+ - map.row_bytes()+elem2param.row_bytes(); + this->_max_bytes=cutsq.row_bytes()+sw_pre.row_bytes()+c_14.row_bytes()+ + c_56.row_bytes()+cut_sigma_gamma.row_bytes()+sw_pre3.row_bytes(); return 0; } @@ -160,11 +129,12 @@ void SWT::clear() { return; _allocated=false; - sw1.clear(); - sw2.clear(); - sw3.clear(); - map.clear(); - elem2param.clear(); + cutsq.clear(); + sw_pre.clear(); + c_14.clear(); + c_56.clear(); + cut_sigma_gamma.clear(); + sw_pre3.clear(); this->clear_atomic(); } @@ -179,58 +149,33 @@ double SWT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int SWT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_short_nbor.run(&this->atom->x, &cutsq, &_lj_types, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - this->time_pair.start(); - - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -240,25 +185,32 
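/* A minimal model (assumed, based on the type_pack4/type_pack2 calls above)
   of how per-pair host tables end up as one packed vector per (i,j) type
   pair; the real work is done by Atom::type_pack4 and friends, so this is a
   schematic of the layout, not the library routine. */
for (int i = 0; i < ntypes; i++)
  for (int j = 0; j < ntypes; j++) {
    const int idx = i * ntypes + j;   // row-major (i,j) -> flat index
    host_write[idx].x = ncut[i][j];   // one vector fetch on the device
    host_write[idx].y = sigma[i][j];  // replaces four separate lookups
    host_write[idx].z = powerp[i][j];
    host_write[idx].w = powerq[i][j];
  }
/* a ucl_copy(...) then uploads host_write into the UCL_D_Vec buffer */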
@@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_three_end_vatom.run(&this->atom->x, &cut_sigma_gamma, + &sw_pre3, &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch,&this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &sw_pre, &c_14, &c_56, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class SW; diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 2b38bd02dc..621ba87208 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -39,88 +39,161 @@ _texture( sw3_tex,int4); //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) 
{ \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) - energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; - - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } #define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \ @@ -334,7 +389,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -349,9 +404,9 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ 
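/* Minimal CUDA sketch of the shuffle reduction used when SHUFFLE_AVAIL is
   set; shfl_down in the macros above wraps __shfl_down_sync. After
   log2(t_per_atom) halving steps the first lane of each t_per_atom-wide
   group holds the group sum, with no shared memory or block barrier. */
__device__ float group_sum(float v, const int t_per_atom) {
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    v += __shfl_down_sync(0xffffffff, v, s, t_per_atom);
  return v;   // valid in lane 0 of the group
}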
virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -384,7 +439,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, \ numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -394,67 +449,68 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \ } +#ifdef ONETYPE +#define sw_cut_ij sw_cut +#define sw_cut_ik sw_cut +#define sw_sigma_gamma_ij sw_sigma_gamma +#define sw_sigma_gamma_ik sw_sigma_gamma +#endif + __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict sw1, - const __global numtyp4 *restrict sw2, - const __global numtyp4 *restrict sw3, - const __global int *restrict map, - const __global int *restrict elem2param, - const int nelements, + const __global numtyp2 *restrict cut_sig_gamma, + const __global numtyp2 *restrict sw_pre3, + const int ntypes, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; - numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); + local_allocate_store_three(); int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + #ifdef ONETYPE + const numtyp sw_cut=cut_sig_gamma[ONETYPE].x; + const numtyp sw_sigma_gamma=cut_sig_gamma[ONETYPE].y; + const numtyp sw_lambda_epsilon_ijk=sw_pre3[ONETYPE3].x; + const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; + #endif + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii sw3_ijparam.y) continue; + int nbor_k; + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,k_end; - if (dev_packed==dev_nbor) { - nbor_k=nborj_start-offset_j+offset_k; - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } else { - nbor_k = nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) nbor_k += n_stride; - k_end = nbor_end; - } - - for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; 
- } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; - } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq); + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double **sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -64,22 +63,21 @@ class SW : public BaseThree { /// Number of atom types int _lj_types; - /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma - UCL_D_Vec sw1; - /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq - UCL_D_Vec sw2; - /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta - UCL_D_Vec sw3; - - UCL_D_Vec elem2param; - UCL_D_Vec map; - int _nparams,_nelements; - - UCL_Texture sw1_tex, sw2_tex, sw3_tex; + UCL_D_Vec cutsq; + /// sw_pre.x = cut, sw_pre.y = sigma, sw_pre.z = powerp, sw_pre.w = powerq + UCL_D_Vec sw_pre; + /// c_14.x = c1, c_14.y = c2, c_14.z = c3, c_14.w = c4 + UCL_D_Vec c_14; + /// c_56.x = c5, c_56.y = c6 + UCL_D_Vec c_56; + /// cut_sigma_gamma.x = cut, cut_sigma_gamma.y = sigma_gamma + UCL_D_Vec cut_sigma_gamma; + /// sw_pre3.x = lambda_epsilon, 
sw_pre3.y = costheta + UCL_D_Vec sw_pre3; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 1935ed615b..5158f135a3 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,15 +27,13 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq) { +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3, double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); @@ -62,10 +60,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int init_ok=0; if (world_me==0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->world_barrier(); if (message) @@ -81,11 +79,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, - sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->gpu_barrier(); if (message) @@ -127,5 +124,3 @@ void sw_gpu_compute(const int ago, const int nlocal, const int nall, double sw_gpu_bytes() { return SWMF.host_memory_usage(); } - - diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index d07b2716e4..0c336c6990 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -69,6 +69,20 @@ int TableT::init(const int ntypes, k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast"); k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap"); k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast"); + + #if defined(LAL_OCL_EV_JIT) + 
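/* Example (assumed usage, mirroring the init_ok flow above) of acting on
   the documented init() return codes from the host side; the reporting
   calls are hypothetical stand-ins for however the caller handles errors. */
int init_ok = SWMF.init(/* ... */);
if (init_ok == -3)
  fprintf(screen, "GPU init failed: out of memory\n");
else if (init_ok == -4)
  fprintf(screen, "GPU init failed: library not compiled for GPU\n");
else if (init_ok == -5)
  fprintf(screen, "GPU init failed: double precision unsupported on card\n");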
k_pair_linear_noev.set_function(*(this->pair_program_noev), + "k_table_linear_fast"); + k_pair_spline_noev.set_function(*(this->pair_program_noev), + "k_table_spline_fast"); + k_pair_bitmap_noev.set_function(*(this->pair_program_noev), + "k_table_bitmap_fast"); + #else + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + #endif + _compiled_styles = true; // If atom type constants fit in shared memory use fast kernel @@ -228,6 +242,11 @@ void TableT::clear() { k_pair_spline.clear(); k_pair_bitmap_fast.clear(); k_pair_bitmap.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_linear_noev.clear(); + k_pair_spline_noev.clear(); + k_pair_bitmap_noev.clear(); + #endif _compiled_styles=false; } @@ -243,19 +262,22 @@ double TableT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TableT::loop(const bool _eflag, const bool _vflag) { +int TableT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - if (_vflag) - vflag=1; - else - vflag=0; + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) { + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + } else { + k_pair_linear_sel = &k_pair_linear_noev; + k_pair_spline_sel = &k_pair_spline_noev; + k_pair_bitmap_sel = &k_pair_bitmap_noev; + } + #endif + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -265,37 +287,37 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { if (_tabstyle == LOOKUP) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { - this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_linear_sel->set_size(GX,BX); + k_pair_linear_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { - this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_spline_sel->set_size(GX,BX); + k_pair_spline_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + 
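/* Compact restatement of the LAL_OCL_EV_JIT pattern above: OpenCL builds the
   pair program twice (with and without energy/virial code), and a plain
   kernel pointer is flipped per call depending on whether the caller asked
   for eflag/vflag. On the CUDA side the same effect comes from the
   compile-time EVFLAG constant instead of a second program. */
#if defined(LAL_OCL_EV_JIT)
  if (eflag || vflag) k_pair_linear_sel = &k_pair_linear_fast;  // EV build
  else                k_pair_linear_sel = &k_pair_linear_noev;  // stripped build
#endif
/* the call site is unchanged: k_pair_linear_sel->set_size(GX,BX); ...->run(...); */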
&this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { - this->k_pair_bitmap_fast.set_size(GX,BX); - this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, - &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_bitmap_sel->set_size(GX,BX); + k_pair_bitmap_sel->run(&this->atom->x, &tabindex, &nshiftbits, + &nmask, &coeff2, &coeff3, &coeff4, &cutsq, + &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } } else { if (_tabstyle == LOOKUP) { @@ -334,6 +356,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { } } this->time_pair.stop(); + return GX; } template class Table; diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 0cf0de2af0..eb29218712 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -58,24 +58,27 @@ __kernel void k_table(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -129,9 +132,9 @@ __kernel void k_table(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_fast(const __global numtyp4 *restrict x_, @@ -153,18 +156,22 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -228,9 +234,9 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- LINEAR ------------------------------------------------- @@ -254,24 +260,27 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - 
virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -329,9 +338,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, @@ -353,18 +362,22 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -432,9 +444,9 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- SPLINE ------------------------------------------------- @@ -458,24 +470,27 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -529,7 +544,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -540,9 +555,9 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_spline_fast(const __global numtyp4 *x_, @@ -564,19 +579,22 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -639,7 +656,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -650,9 +667,9 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- BITMAP ------------------------------------------------- @@ -678,24 +695,27 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -756,9 +776,9 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, @@ -782,18 +802,22 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -864,7 +887,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 38ae012bee..b67a369dad 100644 --- a/lib/gpu/lal_table.h +++ b/lib/gpu/lal_table.h @@ -56,9 +56,10 @@ class Table : public BaseAtomic { double host_memory_usage() const; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_pair_linear, k_pair_linear_fast; - UCL_Kernel k_pair_spline, k_pair_spline_fast; - UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; + UCL_Kernel k_pair_linear, k_pair_linear_fast, k_pair_linear_noev; + UCL_Kernel k_pair_spline, k_pair_spline_fast, k_pair_spline_noev; + UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast, k_pair_bitmap_noev; + UCL_Kernel *k_pair_linear_sel, *k_pair_spline_sel, *k_pair_bitmap_sel; // --------------------------- TYPE DATA -------------------------- @@ -90,7 +91,7 @@ class Table : public BaseAtomic { private: bool _allocated, _compiled_styles; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index f067881b88..6237c4d7cd 
100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); @@ -73,7 +73,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index 63691a2047..e0e87d9148 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -39,7 +39,7 @@ TersoffT::~Tersoff() { template int TersoffT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,82 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int const double* c, const double* d, const double* h, const double* gamma, const double* beta, const double* powern, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=0; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + if (onetype>=0) spq=powermint[onetype3]; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", "k_tersoff_three_center", "k_tersoff_three_end", - "k_tersoff_short_nbor"); + "k_tersoff_short_nbor",onetype,onetype3,spq,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) { + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + _zetaij_eng.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + } k_zeta.set_function(*(this->pair_program),"k_tersoff_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + + // -------------------------------------------------------------------- UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -90,32 +138,29 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int dview[i].w=(numtyp)0; } + // pack coefficients into arrays // pack coefficients into arrays 
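/* Rough reconstruction of the cutsq_pair fill above (the inner loop was
   garbled in transit, so the indexing here is an assumption): for each type
   pair (ii,jj), keep the largest host_cutsq over every three-body parameter
   set the pair can participate in, so the short neighbor list built from
   this per-pair table is safe for all of them. */
for (int ii = 1; ii < ntypes; ii++)
  for (int jj = 1; jj < ntypes; jj++) {
    numtyp maxcut = (numtyp)0;
    for (int kk = 0; kk < nelements; kk++) {
      const int ijkparam = host_elem2param[host_map[ii]][host_map[jj]][kk];
      if ((numtyp)host_cutsq[ijkparam] > maxcut)
        maxcut = (numtyp)host_cutsq[ijkparam];
    }
    host_write[ii * ntypes + jj] = maxcut;
  }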
  ts1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
-    dview[i].x=static_cast<numtyp>(lam1[i]);
-    dview[i].y=static_cast<numtyp>(lam2[i]);
-    dview[i].z=static_cast<numtyp>(lam3[i]);
-    dview[i].w=static_cast<numtyp>(powermint[i]);
+    dview[i].x=static_cast<numtyp>(lam3[i]);
+    dview[i].y=static_cast<numtyp>(powermint[i]);
+    dview[i].z=static_cast<numtyp>(bigr[i]);
+    dview[i].w=static_cast<numtyp>(bigd[i]);
  }

  ucl_copy(ts1,dview,false);
-  ts1_tex.get_texture(*(this->pair_program),"ts1_tex");
-  ts1_tex.bind_float(ts1,4);

  ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(biga[i]);
-    dview[i].y=static_cast<numtyp>(bigb[i]);
+    dview[i].y=static_cast<numtyp>(lam1[i]);
    dview[i].z=static_cast<numtyp>(bigr[i]);
    dview[i].w=static_cast<numtyp>(bigd[i]);
  }

  ucl_copy(ts2,dview,false);
-  ts2_tex.get_texture(*(this->pair_program),"ts2_tex");
-  ts2_tex.bind_float(ts2,4);

  ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
@@ -127,46 +172,28 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
  }

  ucl_copy(ts3,dview,false);
-  ts3_tex.get_texture(*(this->pair_program),"ts3_tex");
-  ts3_tex.bind_float(ts3,4);

  ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
-    dview[i].x=static_cast<numtyp>(c[i]);
-    dview[i].y=static_cast<numtyp>(d[i]);
+    dview[i].x=static_cast<numtyp>(c[i]*c[i]);
+    dview[i].y=static_cast<numtyp>(d[i]*d[i]);
    dview[i].z=static_cast<numtyp>(h[i]);
    dview[i].w=static_cast<numtyp>(gamma[i]);
  }

  ucl_copy(ts4,dview,false);
-  ts4_tex.get_texture(*(this->pair_program),"ts4_tex");
-  ts4_tex.bind_float(ts4,4);

  ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(beta[i]);
    dview[i].y=static_cast<numtyp>(powern[i]);
-    dview[i].z=(numtyp)0;
-    dview[i].w=(numtyp)0;
+    dview[i].z=static_cast<numtyp>(lam2[i]);
+    dview[i].w=static_cast<numtyp>(bigb[i]);
  }

  ucl_copy(ts5,dview,false);
-  ts5_tex.get_texture(*(this->pair_program),"ts5_tex");
-  ts5_tex.bind_float(ts5,4);
-
-  UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
-                               UCL_WRITE_ONLY);
-  double cutsqmax = 0.0;
-  for (int i=0; i<nparams; i++) {
-    cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
-    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
-  }
-  cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
-  ucl_copy(cutsq,cutsq_view,false);
-
-  _cutshortsq = static_cast<numtyp>(cutsqmax);

  UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                                  *(this->ucl_device), UCL_WRITE_ONLY);
@@ -183,17 +210,17 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
  ucl_copy(elem2param,dview_elem2param,false);

-  UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
+  UCL_H_Vec<int> dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY);
  for (int i = 0; i < ntypes; i++)
    dview_map[i] = host_map[i];

-  map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
+  map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(map,dview_map,false);

  _allocated=true;
  this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
-    ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
-    map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
+    ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+
+    elem2param.row_bytes()+_zetaij.row_bytes()+_zetaij_eng.row_bytes();
  return 0;
}

@@ -208,12 +235,16 @@ void TersoffT::clear() {
  ts3.clear();
  ts4.clear();
  ts5.clear();
-  cutsq.clear();
+  cutsq_pair.clear();
  map.clear();
  elem2param.clear();
  _zetaij.clear();
+  _zetaij_eng.clear();

  k_zeta.clear();
+  #if defined(LAL_OCL_EV_JIT)
+  k_zeta_noev.clear();
+  #endif

  this->clear_atomic();
}

@@ -229,75 +260,60 @@ double TersoffT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void TersoffT::loop(const bool _eflag, const bool
_vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + _zetaij_eng.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + success = success && (_zetaij_eng.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &_zetaij_eng, &this->nbor->dev_nbor, &eflag, &this->_ainum, + &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts4, &map, + &elem2param, &_nelements, 
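/* Why ts4 now stores c*c and d*d (see the repacking above): the standard
   Tersoff angular term, evaluated once per (i,j,k) triple on the device, is
       g(theta) = gamma * (1 + c^2/d^2 - c^2 / (d^2 + (h - cos(theta))^2))
   so squaring c and d once on the host removes two multiplies from every
   triple. Function and field names below are illustrative. */
numtyp gijk(const numtyp costheta, const numtyp4 ts4_p) {
  const numtyp c2 = ts4_p.x;                 // c*c, precomputed host-side
  const numtyp d2 = ts4_p.y;                 // d*d, precomputed host-side
  const numtyp hcth = ts4_p.z - costheta;    // h - cos(theta)
  return ts4_p.w * ((numtyp)1.0 + c2/d2 - c2 / (d2 + hcth*hcth));
}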
&_nparams, &_zetaij, + &_zetaij_eng, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -307,24 +323,34 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts2, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class Tersoff; diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b08fddfd6e..03ce68be77 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int 
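// Not part of the patch: a minimal CUDA sketch of the shuffle-based partial
// sum that replaces the shared-memory tree above when SHUFFLE_AVAIL is set.
// Each group of `width` consecutive lanes cooperates on one atom, and lane 0
// of the group ends up holding the group total. Assumes width is a power of
// two and a group never straddles a warp boundary.
__device__ inline float group_sum(float v, int width) {
  for (int s = width / 2; s > 0; s >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, s, width);
  return v;  // valid in lane 0 of each width-wide group
}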
s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + 
if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } +#endif +#endif +#ifdef LAL_SIMD_IP_SYNC +#define t_per_atom t_per_atom_in +#else +#define t_per_atom 1 #endif __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int t_per_atom_in) { + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - numtyp ijkparam_lam3 = ts1_ijkparam.z; - numtyp ijkparam_powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - numtyp ijkparam_bigr = ts2_ijkparam.z; - numtyp ijkparam_bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - numtyp ijkparam_c = ts4_ijkparam.x; - numtyp ijkparam_d = ts4_ijkparam.y; - numtyp ijkparam_h = ts4_ijkparam.z; - numtyp ijkparam_gamma = ts4_ijkparam.w; - z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, ijkparam_bigd, - ijkparam_c, ijkparam_d, ijkparam_h, ijkparam_gamma, - rsq1, rsq2, delr1, delr2); + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp ijkparam_lam3 = ts1_ijkparam.x; + const int ijkparam_powermint = ts1_ijkparam.y; + const numtyp ijkparam_bigr = ts1_ijkparam.z; + const numtyp ijkparam_bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp ijkparam_c = ts4_ijkparam.x; + const numtyp ijkparam_d = ts4_ijkparam.y; + const numtyp ijkparam_h = ts4_ijkparam.z; + const numtyp ijkparam_gamma = ts4_ijkparam.w; + #endif + z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, + ijkparam_bigd, ijkparam_c, ijkparam_d, ijkparam_h, + ijkparam_gamma, r1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); - numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); - numtyp ijparam_lam2 = ts1_ijparam.y; - numtyp4 ts2_ijparam = ts2[ijparam]; //fetch4(ts2_ijparam,ijparam,ts2_tex); - numtyp ijparam_bigb = ts2_ijparam.y; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - numtyp4 ts3_ijparam = ts3[ijparam]; //fetch4(ts3_ijparam,ijparam,ts3_tex); - numtyp ijparam_c1 = ts3_ijparam.x; - numtyp ijparam_c2 = ts3_ijparam.y; - numtyp ijparam_c3 = ts3_ijparam.z; - numtyp ijparam_c4 = ts3_ijparam.w; - numtyp4 ts5_ijparam = ts5[ijparam]; //fetch4(ts5_ijparam,ijparam,ts5_tex); - numtyp ijparam_beta = ts5_ijparam.x; - numtyp ijparam_powern = ts5_ijparam.y; + #ifndef 
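// Not part of the patch: the engv layout implied by the ev_stride indexing
// above (an assumption drawn from the ei += ev_stride increments, not a
// documented API). Column k of {energy, v0..v5} for slot i appears to live
// at engv[k * ev_stride + i], so a host-side total is a strided column sum:
double total_energy(const double *engv, int nslots) {
  double e = 0.0;
  for (int i = 0; i < nslots; i++) e += engv[i];  // energy column starts at 0
  return e;
}

double total_virial_xx(const double *engv, int nslots, int ev_stride) {
  double v = 0.0;
  for (int i = 0; i < nslots; i++) v += engv[ev_stride + i];  // first virial column
  return v;
}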
ONETYPE + const numtyp ijparam_bigr = ts1[ijparam].z; + const numtyp ijparam_bigd = ts1[ijparam].w; + const numtyp4 ts3_ijparam = ts3[ijparam]; + const numtyp ijparam_c1 = ts3_ijparam.x; + const numtyp ijparam_c2 = ts3_ijparam.y; + const numtyp ijparam_c3 = ts3_ijparam.z; + const numtyp ijparam_c4 = ts3_ijparam.w; + const numtyp4 ts5_ijparam = ts5[ijparam]; + const numtyp ijparam_beta = ts5_ijparam.x; + const numtyp ijparam_powern = ts5_ijparam.y; + const numtyp ijparam_lam2 = ts5_ijparam.z; + const numtyp ijparam_bigb = ts5_ijparam.w; + #else + const numtyp ijparam_bigr = ijkparam_bigr; + const numtyp ijparam_bigd = ijkparam_bigd; + #endif if (offset_k == 0) { numtyp fpfeng[4]; force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, - ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, - ijparam_c4, rsq1, z, eflag, fpfeng); - acctyp4 zij; + ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, + ijparam_c3, ijparam_c4, r1, z, eflag, fpfeng); + acctyp2 zij; zij.x = fpfeng[0]; zij.y = fpfeng[1]; - zij.z = fpfeng[2]; - zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; + if (EVFLAG && eflag) zetaij_eng[nbor_j-2*nbor_pitch] = fpfeng[2]; } - } // for nbor } // if ii } __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom_in, + const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); - __local numtyp4 ts1[SHARED_SIZE]; + local_allocate_store_pair(); + + #ifndef ONETYPE __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; + #ifndef ONETYPE + numtyp4 ts2_ijparam = ts2[ijparam]; + const numtyp ijparam_biga = ts2_ijparam.x; + const numtyp ijparam_lam1 = ts2_ijparam.y; + const numtyp ijparam_bigr = ts2_ijparam.z; + const numtyp ijparam_bigd = ts2_ijparam.w; + #endif numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, rsq, eflag, feng); @@ -469,9 +538,9 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -480,86 +549,85 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp 
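// Not part of the patch: a reduced illustration of the ONETYPE scheme used in
// the #ifndef/#else blocks above. When the potential has a single parameter
// set, the build defines ONETYPE to that index and the per-triple table
// gather compiles away. The table names follow the file; the helper itself
// is hypothetical.
template <class V>
inline auto fetch_lam3(const V *ts1, const int *elem2param,
                       int nelements, int itype, int jtype, int ktype) {
#ifdef ONETYPE
  (void)elem2param; (void)nelements; (void)itype; (void)jtype; (void)ktype;
  return ts1[ONETYPE].x;   // constant index: no gather in the inner loop
#else
  const int ijkparam = elem2param[itype * nelements * nelements +
                                  jtype * nelements + ktype];
  return ts1[ijkparam].x;  // generic multi-element lookup
#endif
}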
*restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int evatom) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp2 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { - energy+=zeta_ij.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[nbor_j-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -597,48 +661,45 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp lam3 = ts1_ijkparam.x; + const int powermint = ts1_ijkparam.y; + const numtyp bigr = ts1_ijkparam.z; + const numtyp bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp c = ts4_ijkparam.x; + const numtyp d = ts4_ijkparam.y; + const numtyp h = ts4_ijkparam.z; + const numtyp gamma = ts4_ijkparam.w; + #endif numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - lam3 = ts1_ijkparam.z; - powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - bigr = ts2_ijkparam.z; - bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - c = ts4_ijkparam.x; - d = ts4_ijkparam.y; - h = ts4_ijkparam.z; - gamma = ts4_ijkparam.w; - if (vflag>0) - attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); + if (EVFLAG && vflag) + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, + r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else attractive_fi(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi); @@ -646,7 +707,7 @@ __kernel void 
k_tersoff_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -662,87 +723,90 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int gpu_nbor) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -823,62 +877,62 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1052,41 +1099,44 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); - numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_param, ts2_param, ts4_param; - ts1_param = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; - ts4_param = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); - c = ts4_param.x; - d = ts4_param.y; - h = ts4_param.z; - gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); + prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, + fk); f.x += fj[0]; f.y += fj[1]; f.z += fj[2]; @@ -1098,26 +1148,25 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] 
+= TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); - numtyp prefactor_jk = zeta_jk.y; + numtyp prefactor_jk = zetaij[nbor_k-2*nbor_pitch].y; - int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; - ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; + #ifndef ONETYPE + int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+ + itype]; + ts1_param = ts1[jkiparam]; + lam3 = ts1_param.x; + powermint = ts1_param.y; + bigr = ts1_param.z; + bigd = ts1_param.w; ts4_param = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); c = ts4_param.x; d = ts4_param.y; h = ts4_param.z; gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, fk); + prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, + fk); + #endif f.x += fk[0]; f.y += fk[1]; f.z += fk[2]; @@ -1130,14 +1179,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index 51e64c987b..8f99569162 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -59,41 +59,36 @@ class Tersoff : public BaseThree { // --------------------------- TYPE DATA -------------------------- - /// If atom type constants fit in shared memory, use fast kernels - bool shared_types; - /// Number of atom types - int _lj_types; + int _ntypes; - /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint + /// ts1.x = lam3, ts1.y = powermint, ts1.z = c3, ts1.w = c4 UCL_D_Vec ts1; - /// ts2.x = biga, ts2.y = bigb, ts2.z = bigr, ts2.w = bigd + /// ts2.x = biga, ts2.y = lam1, ts2.z = bigr, ts2.w = bigd UCL_D_Vec ts2; /// ts3.x = c1, ts3.y = c2, ts3.z = c3, ts3.w = c4 UCL_D_Vec ts3; - /// ts4.x = c, ts4.y = d, ts4.z = h, ts4.w = gamma + /// ts4.x = c*c, ts4.y = d*d, ts4.z = h, ts4.w = gamma UCL_D_Vec ts4; - /// ts5.x = beta, ts5.y = powern + /// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; /// Per-atom arrays: - /// zetaij.x = force, zetaij.y = prefactor, zetaij.z = evdwl, - /// zetaij.w = zetaij - UCL_D_Vec _zetaij; + /// zetaij.x = force, zetaij.y = prefactor + UCL_D_Vec _zetaij; + UCL_D_Vec _zetaij_eng; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff 
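// Not part of the patch: a back-of-envelope sketch of why the header above
// splits zeta_ij storage. Force and prefactor are read by every three-body
// kernel, while the per-pair energy is only read when eflag is set, so one
// 4-wide vector per neighbor becomes a 2-wide vector plus an optional scalar
// array. Counts below assume single-precision acctyp and a made-up pair count.
#include <cstdio>

int main() {
  const long n_pairs = 1000000L;             // hypothetical neighbor count
  const long before = n_pairs * 4 * 4;       // acctyp4: 16 B per pair, always
  const long after_force = n_pairs * 2 * 4;  // acctyp2: 8 B per pair, always
  const long after_eng = n_pairs * 4;        // scalar energy, eflag passes only
  std::printf("force pass: %ld -> %ld bytes (+%ld only when eflag)\n",
              before, after_force, after_eng);
  return 0;
}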
--git a/lib/gpu/lal_tersoff_ext.cpp b/lib/gpu/lal_tersoff_ext.cpp index 749842864f..ac700d014a 100644 --- a/lib/gpu/lal_tersoff_ext.cpp +++ b/lib/gpu/lal_tersoff_ext.cpp @@ -63,7 +63,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int int init_ok=0; if (world_me==0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -84,7 +84,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -99,7 +99,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fprintf(screen,"\n"); if (init_ok==0) - TSMF.estimate_gpu_overhead(); + TSMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 7ee29751b7..da2568aa1b 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -55,11 +55,9 @@ ucl_inline numtyp ters_gijk(const numtyp costheta, const numtyp param_h, const numtyp param_gamma) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - return param_gamma*((numtyp)1.0 + ters_c*ucl_recip(ters_d) - - ters_c *ucl_recip(ters_d + hcth*hcth)); + return param_gamma*((numtyp)1.0 + param_c*ucl_recip(param_d) - + param_c *ucl_recip(param_d + hcth*hcth)); } /* ---------------------------------------------------------------------- */ @@ -68,19 +66,20 @@ ucl_inline numtyp ters_gijk_d(const numtyp costheta, const numtyp param_c, const numtyp param_d, const numtyp param_h, - const numtyp param_gamma) + const numtyp param_gamma, + numtyp *ans_d) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - const numtyp numerator = (numtyp)-2.0 * ters_c * hcth; - const numtyp denominator = ucl_recip(ters_d + hcth*hcth); - return param_gamma*numerator*denominator*denominator; + const numtyp idhh=ucl_recip(param_d + hcth*hcth); + const numtyp numerator = (numtyp)-2.0 * param_c * hcth; + *ans_d=param_gamma*numerator*idhh*idhh; + return param_gamma*((numtyp)1.0+param_c*ucl_recip(param_d)-param_c*idhh); } /* ---------------------------------------------------------------------- */ -ucl_inline void costheta_d(const numtyp rij_hat[3], +ucl_inline void costheta_d(const numtyp cos_theta, + const numtyp rij_hat[3], const numtyp rij, const numtyp rik_hat[3], const numtyp rik, @@ -89,9 +88,6 @@ ucl_inline void costheta_d(const numtyp rij_hat[3], numtyp *drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk - - numtyp cos_theta = vec3_dot(rij_hat,rik_hat); - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); vec3_scale(ucl_recip(rij),drj,drj); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); @@ -107,7 +103,9 @@ ucl_inline numtyp ters_fc(const numtyp r, const numtyp param_bigd) { if (r < param_bigr-param_bigd) return (numtyp)1.0; + #ifndef ONETYPE if (r > param_bigr+param_bigd) return (numtyp)0.0; + #endif return (numtyp)0.5*((numtyp)1.0 - 
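// For reference, restating the angular functions above in standard Tersoff
// notation (param_c and param_d now carry c^2 and d^2 after the repacking):
//
//   g(\theta)              = \gamma \left[ 1 + \frac{c^2}{d^2}
//                              - \frac{c^2}{d^2 + (h - \cos\theta)^2} \right]
//   \frac{dg}{d\cos\theta} = \frac{-2\,\gamma\,c^2\,(h - \cos\theta)}
//                                 {\left[ d^2 + (h - \cos\theta)^2 \right]^2}
//
// The fused ters_gijk_d() returns g and writes dg/dcos(theta) through ans_d,
// sharing the reciprocal 1/(d^2 + (h - cos theta)^2) between both expressions.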
sin(MY_PI2*(r - param_bigr)/param_bigd)); } @@ -115,24 +113,23 @@ ucl_inline numtyp ters_fc(const numtyp r, ucl_inline numtyp ters_fc_d(const numtyp r, const numtyp param_bigr, - const numtyp param_bigd) + const numtyp param_bigd, + numtyp *ans_d) { - if (r < param_bigr-param_bigd) return (numtyp)0.0; - if (r > param_bigr+param_bigd) return (numtyp)0.0; - return -(MY_PI4/param_bigd) * cos(MY_PI2*(r - param_bigr)/param_bigd); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_fa(const numtyp r, - const numtyp param_bigb, - const numtyp param_bigr, - const numtyp param_bigd, - const numtyp param_lam2) -{ - if (r > param_bigr + param_bigd) return (numtyp)0.0; - return -param_bigb * ucl_exp(-param_lam2 * r) * - ters_fc(r,param_bigr,param_bigd); + if (r < param_bigr-param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)1.0; + } + #ifndef ONETYPE + if (r > param_bigr+param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)0.0; + } + #endif + const numtyp ibigd = ucl_recip(param_bigd); + const numtyp angle = MY_PI2*(r - param_bigr)*ibigd; + *ans_d=-(MY_PI4*ibigd) * cos(angle); + return (numtyp)0.5*((numtyp)1.0 - sin(angle)); } /* ---------------------------------------------------------------------- */ @@ -141,33 +138,17 @@ ucl_inline numtyp ters_fa_d(const numtyp r, const numtyp param_bigb, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_lam2) + const numtyp param_lam2, + numtyp *ans_d) { + #ifndef ONETYPE if (r > param_bigr + param_bigd) return (numtyp)0.0; - return param_bigb * ucl_exp(-param_lam2 * r) * (param_lam2 * - ters_fc(r,param_bigr,param_bigd) - ters_fc_d(r,param_bigr,param_bigd)); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_bij(const numtyp zeta, - const numtyp param_beta, - const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, - const numtyp param_c3, - const numtyp param_c4) -{ - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) return ucl_rsqrt(tmp); - if (tmp > param_c2) - return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / - ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp); - if (tmp < param_c4) return (numtyp)1.0; - if (tmp < param_c3) - return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), - (numtyp)-1.0/((numtyp)2.0*param_powern)); + #endif + numtyp dfc; + const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc); + const numtyp blr = param_bigb * ucl_exp(-param_lam2 * r); + *ans_d = blr * (param_lam2 * fc - dfc); + return -blr * fc; } /* ---------------------------------------------------------------------- */ @@ -178,24 +159,35 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, const numtyp param_c1, const numtyp param_c2, const numtyp param_c3, - const numtyp param_c4) + const numtyp param_c4, + numtyp *ans_d) { - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) - return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); - if (tmp > param_c2) - return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * - // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * - ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * - ucl_powr(tmp,-param_powern))); - if (tmp < param_c4) return (numtyp)0.0; - if (tmp < param_c3) - return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); - - numtyp tmp_n = ucl_powr(tmp,param_powern); - return (numtyp)-0.5 * 
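// Not part of the patch: a minimal scalar sketch of the fused smooth cutoff
// above. One call now returns f_c(r) and writes f_c'(r), so callers that
// previously invoked ters_fc() and ters_fc_d() back to back evaluate the
// trigonometry once. MY_PI2/MY_PI4 are taken to be pi/2 and pi/4; the
// r > R + D early-out sits under #ifndef ONETYPE above, presumably because
// the short neighbor list already guarantees r is inside the cutoff there.
#include <cmath>

inline double fc_and_deriv(double r, double R, double D, double *dfc) {
  if (r < R - D) { *dfc = 0.0; return 1.0; }   // fully inside: full strength
  if (r > R + D) { *dfc = 0.0; return 0.0; }   // fully outside: cut off
  const double iD = 1.0 / D;
  const double a  = (M_PI / 2.0) * (r - R) * iD;
  *dfc = -(M_PI / 4.0) * iD * std::cos(a);
  return 0.5 * (1.0 - std::sin(a));
}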
ucl_powr((numtyp)1.0+tmp_n, (numtyp) - - (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta; + const numtyp tmp = param_beta * zeta; + if (tmp > param_c1) { + *ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); + return ucl_rsqrt(tmp); + } + if (tmp > param_c2) { + const numtyp ptmp = ucl_powr(tmp,-param_powern); + const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern); + *ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * + ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) * + ptmp)); + return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp); + } + if (tmp < param_c4) { + *ans_d = (numtyp)0.0; + return (numtyp)1.0; + } + if (tmp < param_c3) { + *ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); + return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); + } + const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern); + const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern); + *ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/ + zeta; + return ucl_powr(tmp_n, i2n); } /* ---------------------------------------------------------------------- */ @@ -207,7 +199,7 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -220,25 +212,23 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -277,7 +267,7 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -288,25 +278,23 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if 
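// For reference, the bond order implemented by the fused ters_bij_d() above
// and its zeta-derivative (general branch; the c1..c4 cases are the usual
// series shortcuts for very large or very small beta*zeta):
//
//   b_{ij} = \bigl( 1 + (\beta\zeta_{ij})^{n} \bigr)^{-1/(2n)}
//   \frac{\partial b_{ij}}{\partial \zeta_{ij}}
//        = -\tfrac{1}{2}\,\bigl( 1 + (\beta\zeta_{ij})^{n} \bigr)^{-1-1/(2n)}
//          \,\frac{(\beta\zeta_{ij})^{n}}{\zeta_{ij}}
//
// which matches ans_d = -0.5 * tmp_n^(-1 + i2n) * (tmp_n - 1) / zeta with
// tmp_n = 1 + (beta*zeta)^n and i2n = -1/(2n) in the code.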
(param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -327,7 +315,7 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -341,21 +329,20 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, fc = ters_fc(rik,param_bigr,param_bigd); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rj // drj = fc*gijk_d*ex_delr*dcosdrj; @@ -373,7 +360,7 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -384,25 +371,23 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rk // drk = dfc*gijk*ex_delr*rik_hat; @@ -427,18 +412,17 @@ ucl_inline void repulsive(const numtyp param_bigr, { numtyp r,tmp_fc,tmp_fc_d,tmp_exp; r = ucl_sqrt(rsq); - tmp_fc = ters_fc(r,param_bigr,param_bigd); - tmp_fc_d = ters_fc_d(r,param_bigr,param_bigd); - tmp_exp = 
ucl_exp(-param_lam1 * r); + tmp_fc = ters_fc_d(r,param_bigr,param_bigd,&tmp_fc_d); + tmp_exp = param_biga * ucl_exp(-param_lam1 * r); // fforce - ans[0] = -param_biga*tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); + ans[0] = -tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); // eng - if (eflag) ans[1] = tmp_fc * param_biga * tmp_exp; + if (EVFLAG && eflag) ans[1] = tmp_fc * tmp_exp; } /* ---------------------------------------------------------------------- */ -ucl_inline numtyp zeta(const numtyp param_powermint, +ucl_inline numtyp zeta(const int param_powermint, const numtyp param_lam3, const numtyp param_bigr, const numtyp param_bigd, @@ -446,20 +430,19 @@ ucl_inline numtyp zeta(const numtyp param_powermint, const numtyp param_d, const numtyp param_h, const numtyp param_gamma, - const numtyp rsqij, + const numtyp rij, const numtyp rsqik, const numtyp4 delrij, const numtyp4 delrik) { - numtyp rij,rik,costheta,arg,ex_delr; + numtyp rik,costheta,arg,ex_delr; - rij = ucl_sqrt(rsqij); rik = ucl_sqrt(rsqik); costheta = (delrij.x*delrik.x + delrij.y*delrik.y + delrij.z*delrik.z) / (rij*rik); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) arg = t*t*t; + if (param_powermint == 3) arg = t*t*t; else arg = t; if (arg > (numtyp)69.0776) ex_delr = (numtyp)1.e30; @@ -482,22 +465,19 @@ ucl_inline void force_zeta(const numtyp param_bigb, const numtyp param_c2, const numtyp param_c3, const numtyp param_c4, - const numtyp rsq, + const numtyp r, const numtyp zeta_ij, const int eflag, numtyp fpfeng[4]) { - numtyp r,fa,fa_d,bij; + numtyp fa,fa_d,bij,bij_d; - r = ucl_sqrt(rsq); - fa = ters_fa(r,param_bigb,param_bigr,param_bigd,param_lam2); - fa_d = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2); - bij = ters_bij(zeta_ij,param_beta,param_powern, - param_c1,param_c2, param_c3, param_c4); - fpfeng[0] = (numtyp)0.5*bij*fa_d * ucl_recip(r); // fforce - fpfeng[1] = (numtyp)-0.5*fa * ters_bij_d(zeta_ij,param_beta, param_powern, - param_c1,param_c2, param_c3, param_c4); // prefactor - if (eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng + fa = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2,&fa_d); + bij = ters_bij_d(zeta_ij,param_beta,param_powern, + param_c1,param_c2, param_c3, param_c4, &bij_d); + fpfeng[0] = (numtyp)0.5*bij*fa_d*ucl_recip(r); // fforce + fpfeng[1] = (numtyp)-0.5*fa*bij_d; // prefactor + if (EVFLAG && eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng } /* ---------------------------------------------------------------------- @@ -508,7 +488,7 @@ ucl_inline void force_zeta(const numtyp param_bigb, ucl_inline void attractive(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -535,7 +515,7 @@ ucl_inline void attractive(const numtyp param_bigr, ucl_inline void attractive_fi(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -560,7 +540,7 @@ ucl_inline void attractive_fi(const numtyp param_bigr, ucl_inline void attractive_fj(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -585,7 +565,7 @@ ucl_inline void attractive_fj(const numtyp param_bigr, ucl_inline void attractive_fk(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int 
param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -610,5 +590,3 @@ ucl_inline void attractive_fk(const numtyp param_bigr, #endif - - diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 2b56991cc6..b7b0fff1b9 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -39,7 +39,7 @@ TersoffMT::~TersoffMod() { template int TersoffMT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,78 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in const double* c5, const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", - "k_tersoff_mod_short_nbor"); + "k_tersoff_mod_three_center", + "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_mod_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_mod_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -101,8 +145,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -114,8 +156,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -127,8 +167,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -140,8 +178,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts4,dview,false); - 
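// Not part of the patch: a host-side sketch of how the cutsq_pair table in
// init() above is assembled, reconstructed from the loop fragments (the
// template arguments and comparison operators were mangled in this diff).
// For each atom-type pair the largest squared cutoff over all parameter sets
// involving the mapped elements is kept; this is the value the short-neighbor
// kernel tests against per pair instead of one global _cutshortsq.
#include <vector>
#include <algorithm>

std::vector<double> build_cutsq_pair(const int *map, const int *elem2param,
                                     const double *host_cutsq,
                                     int ntypes, int nelements) {
  std::vector<double> cut(ntypes * ntypes, 0.0);
  for (int i = 1; i < ntypes; i++) {          // LAMMPS types are 1-based
    for (int j = 1; j < ntypes; j++) {
      if (map[i] < 0 || map[j] < 0) continue; // unmapped types (assumption)
      for (int k = 0; k < nelements; k++) {
        const int p = elem2param[map[i] * nelements * nelements +
                                 map[j] * nelements + k];
        cut[i * ntypes + j] = std::max(cut[i * ntypes + j], host_cutsq[p]);
      }
    }
  }
  return cut;
}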
ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -153,20 +189,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -183,17 +205,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); return 0; } @@ -208,12 +229,15 @@ void TersoffMT::clear() { ts3.clear(); ts4.clear(); ts5.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -229,74 +253,54 @@ double TersoffMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffMT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + 
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->nbor->dev_nbor,&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -307,24 +311,34 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + 
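// Not part of the patch: the grid-size arithmetic used repeatedly in loop()
// above, pulled out for clarity. BX, JTHREADS, and KTHREADS are as in the
// kernels; the static_cast targets were stripped from this diff and are
// assumed to be int/double as in the rest of the library.
#include <cmath>

inline int grid_size(int nitems, int block_size, int threads_per_item) {
  // each item (atom) gets threads_per_item threads, so one block covers
  // block_size / threads_per_item items
  return static_cast<int>(std::ceil(static_cast<double>(nitems) /
                                    (block_size / threads_per_item)));
}
// zeta and three-body launches use threads_per_item = JTHREADS * KTHREADS;
// the short-neighbor and pair launches use _threads_per_atom (or 1).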
(BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffMod; diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 0f45653264..44b04c6933 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, 
t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -348,9 +390,6 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, ijkparam_c5, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -376,7 +415,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict 
x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -386,22 +425,20 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -470,9 +497,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -481,11 +508,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, @@ -493,26 +518,24 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -524,46 +547,37 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= 
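
/* Every zeta store and lookup now uses the flat index nbor_j - 2*nbor_pitch in
   place of the old dev_short_nbor/idx bookkeeping. A hedged reading: the
   packed neighbor list reserves two pitch-strided header rows per atom (the
   neighbor count and the list start), so dropping them maps a neighbor-list
   position one-to-one onto a slot in _zetaij. The layout is inferred from the
   constant 2*nbor_pitch, not verified against the neighbor-list header: */
__device__ inline int zeta_slot(int nbor_j, int nbor_pitch) {
  return nbor_j - 2 * nbor_pitch;  // skip the two header rows
}
// writer (k_*_zeta):         zetaij[zeta_slot(nbor_j, nbor_pitch)] = zij;
// reader (k_*_three_center): acctyp4 zeta_ij = zetaij[zeta_slot(nbor_j, nbor_pitch)];
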
cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -601,14 +611,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -643,7 +646,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, numtyp4 ts5_ijkparam = ts5[ijkparam]; //fetch4(ts5_ijkparam,ijkparam,ts5_tex); c5 = ts5_ijkparam.x; h = ts5_ijkparam.y; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, h, c1, c2, c3, c4, c5, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -653,7 +656,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -669,10 +672,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, @@ -680,27 +682,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -712,23 +712,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, 
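
/* A pattern repeated in every kernel of this patch: energy and virial are now
   declared uninitialized and only touched under EVFLAG, a compile-time 0/1
   constant baked in when the kernel source is built, so the force-only
   variant drops the accumulators and every "if (EVFLAG && ...)" branch as
   dead code. The guarded initialization, isolated from the hunks above: */
acctyp energy, virial[6];
if (EVFLAG) {                    // compiled out entirely when EVFLAG == 0
  energy = (acctyp)0;
  for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
}
// ...later, inside the neighbor loop:
// if (EVFLAG && eflag) energy   += e_pair;
// if (EVFLAG && vflag) virial[0] += delx*delx*force;  // and so on for [1..5]
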
virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -833,7 +816,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1071,7 +1031,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1120,10 +1078,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1155,14 +1110,13 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h index 29a561c71d..0baa1307cb 100644 --- a/lib/gpu/lal_tersoff_mod.h +++ b/lib/gpu/lal_tersoff_mod.h @@ -63,7 +63,7 @@ class TersoffMod : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -76,7 +76,7 @@ class TersoffMod : public BaseThree { /// ts5.x = c5, ts5.y = h UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -87,13 +87,11 @@ class TersoffMod : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_mod_ext.cpp b/lib/gpu/lal_tersoff_mod_ext.cpp index cce9df8713..cac284fb70 100644 --- a/lib/gpu/lal_tersoff_mod_ext.cpp +++ b/lib/gpu/lal_tersoff_mod_ext.cpp @@ -63,7 +63,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) 
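
/* The loop() signature change in the header above (void with bool flags
   becomes int with int flags plus bool &success) defines a contract shared by
   all styles in this patch: loop() returns the grid size of the launch that
   writes engv — reused later as the ev_stride between energy and virial
   slots — and reports device-allocation failure through success instead of
   aborting. A hedged sketch of a caller; the real call site lives in the
   BaseThree compute path, which is not part of this patch: */
// int loop(const int eflag, const int vflag, const int evatom, bool &success);
bool ok = true;
const int ev_stride = this->loop(eflag, vflag, evatom, ok);
if (!ok) { /* grow buffers or flag the error instead of crashing mid-step */ }
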
- init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -84,7 +84,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -99,7 +99,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSMMF.estimate_gpu_overhead(); + TSMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 7d254d568d..4456712b0a 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -39,7 +39,7 @@ TersoffZT::~TersoffZBL() { template int TersoffZT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -59,34 +59,78 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, const double global_a_0, const double global_epsilon_0, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", - "k_tersoff_zbl_short_nbor"); + "k_tersoff_zbl_three_center", + "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_zbl_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zbl_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes = ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -108,8 +152,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ 
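
/* The init() code above fills the new cutsq_pair table on the host: an
   ntypes*ntypes array holding, for each type pair (i,j), the largest cutoff
   squared over every third element k, so the short-neighbor kernel can prune
   with a single lookup per pair. A reconstructed sketch — the flattened
   i*ne*ne + j*ne + k indexing mirrors what the kernels use, but treating
   host_cutsq as indexed by that parameter id is an assumption: */
host_write.zero();
for (int i = 1; i < ntypes; i++)
  for (int j = 1; j < ntypes; j++)
    if (host_map[i] >= 0 && host_map[j] >= 0)
      for (int k = 0; k < nelements; k++) {
        const int p = host_map[i]*nelements*nelements +
                      host_map[j]*nelements + k;
        if (host_cutsq[p] > host_write[i*ntypes + j])
          host_write[i*ntypes + j] = host_cutsq[p];
      }
ucl_copy(cutsq_pair, host_write, ntypes*ntypes);
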
-121,8 +163,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -134,8 +174,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -147,8 +185,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts4,dview,false); - ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -160,8 +196,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); ts6.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -173,20 +207,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts6,dview,false); - ts6_tex.get_texture(*(this->pair_program),"ts6_tex"); - ts6_tex.bind_float(ts6,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -203,11 +223,11 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _global_e = global_e; @@ -216,8 +236,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+elem2param.row_bytes()+ + _zetaij.row_bytes(); return 0; } @@ -233,12 +253,15 @@ void TersoffZT::clear() { ts4.clear(); ts5.clear(); ts6.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -254,75 +277,54 @@ double TersoffZT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - 
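
/* The _zetaij regrow above follows a grow-with-headroom pattern: when the
   required capacity exceeds the current allocation, the buffer is dropped and
   re-allocated 10% larger than strictly needed, with failure funneled into
   the success flag. The same hunk, restated with the static_cast template
   arguments spelled out: */
if (nall * this->nbor->max_nbors() > (int)_zetaij.cols()) {
  int _nmax = static_cast<int>(static_cast<double>(nall) * 1.10);
  _zetaij.clear();
  success = success &&
            (_zetaij.alloc(this->nbor->max_nbors() * _nmax,
                           *(this->ucl_device), UCL_READ_WRITE) == UCL_SUCCESS);
  if (!success) return 0;
}
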
this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffZT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->_ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - + // build the short neighbor list + int ainum=this->_ainum; this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &ts6, - &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -333,24 +335,35 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + 
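
/* The recurring launch-size arithmetic above, written out once: each atom is
   served by t_per_atom cooperating threads (t_per_atom squared, i.e.
   JTHREADS*KTHREADS, for the three-body kernels), so a block of BX threads
   covers BX/t_per_atom atoms and the grid is the ceiling of ainum over that.
   The helper name is illustrative: */
#include <cmath>
static inline int grid_size(int ainum, int BX, int threads_per_item) {
  const int items_per_block = BX / threads_per_item;
  return static_cast<int>(std::ceil(static_cast<double>(ainum) /
                                    items_per_block));
}
// e.g. GX = grid_size(this->_ainum, this->block_pair(), JTHREADS*KTHREADS);
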
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &ts6, &_global_e, &_global_a_0, + &_global_epsilon_0, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffZBL; diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index f631cab91f..fce1ccc406 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -48,72 +48,16 @@ _texture( ts6_tex,int4); #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define 
acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -121,36 +65,168 @@ _texture( ts6_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int 
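
/* In the SHUFFLE_AVAIL==0 fallback above, acc_zeta keeps its shared-memory
   tree reduction but gains a simdsync() between strides: a lane must never
   read its partner's slot before the partner has written it, a hazard the old
   lock-step assumption hid on hardware without independent lane scheduling.
   CUDA sketch of the same shape, with __syncwarp standing in for simdsync (an
   assumed equivalence) and BLOCK_PAIR as in the patch: */
__shared__ float red_acc[BLOCK_PAIR];
red_acc[tid] = z;
for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1) {
  __syncwarp();                 // barrier between reduction strides
  if (offset < s) red_acc[tid] += red_acc[tid + s];
}
z = red_acc[tid];
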
t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -351,9 +408,6 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -384,7 +438,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -397,22 +451,20 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts6_in, const numtyp global_e, const numtyp global_a_0, const numtyp global_epsilon_0, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts6[SHARED_SIZE]; @@ -422,36 +474,28 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, ts6[tid]=ts6_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -489,9 +531,9 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -500,37 +542,33 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - 
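
/* The rewritten short-neighbor kernel above drops the single global
   _cutshortsq in favor of the per-pair cutsq_pair table, plus a ONETYPE fast
   path: when only one type pair is active, ONETYPE is #defined to its
   flattened index and the lookup collapses to one uniform load hoisted out of
   the neighbor loop. A compact sketch — the itype/jtype derivation and the
   trimmed-list write are abbreviated, and float stands in for numtyp: */
__device__ void keep_if_cut(const float *cutsq_pair, int ntypes, int itype,
                            int jtype, float rsq, int j, int *out, int *n) {
#ifdef ONETYPE
  const float cutsq = cutsq_pair[ONETYPE];              // uniform, hoistable
#else
  const float cutsq = cutsq_pair[itype*ntypes + jtype]; // per-pair lookup
#endif
  if (rsq < cutsq) out[(*n)++] = j;   // j survives into the trimmed list
}
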
const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -540,46 +578,37 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -617,14 +642,8 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -656,7 +674,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, d = ts4_ijkparam.y; h = ts4_ijkparam.z; gamma = ts4_ijkparam.w; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -666,7 +684,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -682,37 +700,34 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * 
dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -722,23 +737,25 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -843,7 +841,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1072,7 +1047,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1118,10 +1092,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1150,14 +1121,13 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h index eb03e9fb02..b82b391765 100644 --- a/lib/gpu/lal_tersoff_zbl.h +++ b/lib/gpu/lal_tersoff_zbl.h @@ -65,7 +65,7 @@ class TersoffZBL : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = 
lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree { /// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale UCL_D_Vec ts6; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -91,15 +91,13 @@ class TersoffZBL : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; numtyp _global_e,_global_a_0,_global_epsilon_0; - numtyp _cutshortsq; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_zbl_ext.cpp b/lib/gpu/lal_tersoff_zbl_ext.cpp index d1a9e090b6..518b535627 100644 --- a/lib/gpu/lal_tersoff_zbl_ext.cpp +++ b/lib/gpu/lal_tersoff_zbl_ext.cpp @@ -70,7 +70,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -93,7 +93,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -110,7 +110,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSZMF.estimate_gpu_overhead(); + TSZMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_ufm.cpp b/lib/gpu/lal_ufm.cpp index a86d07f340..f6a48d4470 100644 --- a/lib/gpu/lal_ufm.cpp +++ b/lib/gpu/lal_ufm.cpp @@ -131,20 +131,9 @@ double UFMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void UFMT::loop(const bool _eflag, const bool _vflag) { +int UFMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &uf1, &uf3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &uf1, &uf3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -166,6 +155,7 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class UFM; diff --git a/lib/gpu/lal_ufm.cu b/lib/gpu/lal_ufm.cu index 03d1e85bdf..9d6c7b978a 100644 --- a/lib/gpu/lal_ufm.cu +++ b/lib/gpu/lal_ufm.cu 
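
/* The same migration applies to the simple pair styles below (ufm, yukawa):
   loop() takes eflag/vflag as plain ints and returns the grid size GX. The
   bool-to-int conversion boilerplate disappears because the flags are no
   longer binary — the "vflag==2 || eflag==2" tests in the store_answers_p
   macros above suggest 0 = off, 1 = global accumulation, 2 = per-atom
   accumulation, though that encoding is an inference from this patch alone: */
// old:  void loop(const bool _eflag, const bool _vflag);
// new:  int  loop(const int eflag, const int vflag);   // returns GX
int loop_sketch(int ainum, int BX, int tpa) {
  // the grid size doubles as the ev_stride handed back to the base class
  const int GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/tpa)));
  // ... set_size(GX,BX) and run the selected kernel ...
  return GX;
}
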
@@ -40,16 +40,19 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x*log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -95,9 +98,9 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, @@ -116,26 +119,29 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, __local numtyp4 uf1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 uf3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) uf3[tid]=uf3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x * log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -181,8 +187,8 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_ufm.h b/lib/gpu/lal_ufm.h index 14b96bcc86..390af831ba 100644 --- a/lib/gpu/lal_ufm.h +++ b/lib/gpu/lal_ufm.h @@ -77,7 +77,7 @@ class UFM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_ufm_ext.cpp b/lib/gpu/lal_ufm_ext.cpp index 12809a28fb..432cbb2e63 100644 --- a/lib/gpu/lal_ufm_ext.cpp +++ b/lib/gpu/lal_ufm_ext.cpp @@ -57,7 +57,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, int init_ok=0; if (world_me==0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->world_barrier(); @@ -75,7 +75,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, } if (gpu_rank==i && world_me!=0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 4af8a0f71c..c343de3f55 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ 
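
/* The new local_allocate_store_pair() call sits at kernel scope, before the
   "if (ii < inum)" body, because shared-memory scratch must be declared where
   every thread in the block reaches it uniformly. A plausible expansion for
   the no-shuffle build — this is an assumption about a macro defined in the
   shared preprocessor headers, not in this patch: */
#if (SHUFFLE_AVAIL == 0)
#define local_allocate_store_pair() \
  __shared__ acctyp red_acc[6][BLOCK_PAIR]
#else
#define local_allocate_store_pair()   // shuffle build needs no scratch
#endif
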
-50,7 +50,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -138,8 +138,6 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i dview[i].w=static_cast(r0[i]); } - _cutshortsq = static_cast(r0sqmax); - ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -212,60 +210,33 @@ double VashishtaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int VashishtaT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); - - // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 - // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); this->time_pair.start(); - // note that k_pair does not run with the short neighbor list - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->nbor->dev_packed, &ainum, &nbor_pitch, + &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); + Answer *end_ans; #ifdef THREE_CONCURRENT end_ans=this->ans2; @@ -274,23 +245,34 @@ void VashishtaT::loop(const bool 
_eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, + ¶m4, ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + // note that k_pair does not run with the short neighbor list + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, + &map, &elem2param, &_nelements, &this->nbor->dev_packed, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &GX); + this->time_pair.stop(); + return GX; } template class Vashishta; diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index da15aaf09a..6c9ba14b4a 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -32,6 +32,14 @@ _texture( param4_tex,int4); _texture( param5_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define param1_tex param1 +#define param2_tex param2 +#define param3_tex param3 +#define param4_tex param4 +#define param5_tex param5 +#endif + #else #define pos_tex x_ #define param1_tex param1 @@ -41,92 +49,167 @@ _texture( param5_tex,int4); #define param5_tex param5 #endif + + #define THIRD (numtyp)0.66666666666666666667 //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ 
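
/* The __CUDACC_VER_MAJOR__ >= 11 block above reflects that the legacy texture
   reference API behind these *_tex bindings is deprecated as of CUDA 11;
   under nvcc 11+ each tex name is aliased straight to its global-memory
   array, so fetches degrade gracefully to plain loads: */
#if (__CUDACC_VER_MAJOR__ >= 11)
#define param1_tex param1   // fetch4(v,i,param1_tex) becomes a direct read
#define param2_tex param2
#define param3_tex param3
#define param4_tex param4
#define param5_tex param5
#endif
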
- if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) + if (EVFLAG && eflag) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -293,11 +381,10 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, } } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + const int tid=THREAD_ID_X; + store_answers_p(f,energy,virial,ii,inum,tid,1,0,eflag,vflag,ans,engv, + ev_stride); } #define threebody(delr1x, delr1y, delr1z, eflag, energy) \ @@ -344,9 +431,9 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -402,54 +489,45 @@ __kernel void 
k_vashishta_three_center(const __global numtyp4 *restrict x_, const __global int *restrict elem2param, const int nelements, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp param_gamma_ij, param_r0sq_ij, param_r0_ij, param_gamma_ik, param_r0sq_ik, param_r0_ik; numtyp param_costheta_ijk, param_bigc_ijk, param_bigb_ijk, param_big2b_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + local_allocate_store_three(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii { int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, + const double* cutsq, const double* r0, const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -82,13 +82,12 @@ class Vashishta : public BaseThree { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; - numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_vashishta_ext.cpp b/lib/gpu/lal_vashishta_ext.cpp index 56dfd8a0ff..ecbdefed19 100644 --- a/lib/gpu/lal_vashishta_ext.cpp +++ b/lib/gpu/lal_vashishta_ext.cpp @@ -32,7 +32,7 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -63,10 +63,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i int init_ok=0; if (world_me==0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->world_barrier(); @@ -83,10 +83,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, 
cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 453139e537..707f60f071 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -109,20 +109,9 @@ double YukawaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void YukawaT::loop(const bool _eflag, const bool _vflag) { +int YukawaT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -130,8 +119,8 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &_kappa, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -144,6 +133,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Yukawa; diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index 62bc013dc6..6ebd2dc06d 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -38,22 +38,25 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, @@ -124,25 +127,28 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG 
&& vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -193,8 +199,8 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 7d638d760e..51871a9728 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -72,7 +72,7 @@ class Yukawa : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp index 46d4d64328..a447bb3889 100644 --- a/lib/gpu/lal_yukawa_colloid.cpp +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -133,10 +133,25 @@ double YukawaColloidT::host_memory_usage() const { template void YukawaColloidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, - int *numj, int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *rad) { + int *numj, int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array -------------------------- @@ -177,8 +192,8 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->atom->add_x_data(host_x,host_type); this->add_rad_data(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -187,14 +202,28 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- template -int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, +int** YukawaColloidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, + tagint **special, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array ---------------------------- @@ -240,8 +269,8 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall 
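The compute() wrappers above translate the caller's booleans into the integer flag convention the reworked kernels consume: 0 skips accumulation, 1 requests global energy/virial via the block reduction, and 2 requests per-atom tallies; building with LAL_NO_BLOCK_REDUCE promotes any nonzero flag to 2. A minimal Python sketch of that mapping (function name hypothetical):

```python
def encode_ev_flags(eflag_in, vflag_in, eatom, vatom, no_block_reduce=False):
    """Mirror the flag setup in YukawaColloidT::compute (sketch):
    0 = skip, 1 = global accumulation (block reduce), 2 = per-atom."""
    eflag = 2 if eatom else (1 if eflag_in else 0)
    vflag = 2 if vatom else (1 if vflag_in else 0)
    if no_block_reduce:  # LAL_NO_BLOCK_REDUCE build option
        eflag = 2 if eflag else 0
        vflag = 2 if vflag else 0
    return eflag, vflag

# global energy only, per-atom virial:
assert encode_ev_flags(True, True, False, True) == (1, 2)
```

The resulting pair of flags is handed to set_kernel(eflag,vflag), which picks the kernel variant that k_pair_sel then launches.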
*ilist=this->nbor->host_ilist.begin(); *jnum=this->nbor->host_acc.begin(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -252,20 +281,9 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { +int YukawaColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -273,8 +291,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &c_rad, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &c_rad, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); @@ -286,6 +304,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } this->time_pair.stop(); + return GX; } template class YukawaColloid; diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 30b458fec7..847ffa6d80 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -24,6 +24,10 @@ _texture_2d( pos_tex,int4); _texture( rad_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define rad_tex rad_ +#endif + #else #define pos_tex x_ #define rad_tex rad_ @@ -45,22 +49,25 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -113,9 +120,9 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, @@ -134,25 +141,28 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) 
sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -205,8 +215,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h index 607bc42321..a08248dd3a 100644 --- a/lib/gpu/lal_yukawa_colloid.h +++ b/lib/gpu/lal_yukawa_colloid.h @@ -114,7 +114,7 @@ class YukawaColloid : public BaseAtomic { private: bool _shared_view; bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index 988d33bdd6..db86f91689 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -55,7 +55,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 995694bdfd..cf2bf89e3d 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -55,7 +55,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); @@ -73,7 +73,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp index 2bf3369174..885f6f10bb 100644 --- a/lib/gpu/lal_zbl.cpp +++ b/lib/gpu/lal_zbl.cpp @@ -118,20 +118,9 @@ double ZBLT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ZBLT::loop(const bool _eflag, const bool _vflag) { +int ZBLT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -139,8 +128,8 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &coeff3, &_cut_globalsq, &_cut_innersq, &_cut_inner, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -154,6 +143,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class ZBL; diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index 2539c0ddd7..09e1b4f6bb 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -95,17 +95,20 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -151,7 +154,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, @@ -186,25 +189,28 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -251,7 +257,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -262,8 +268,8 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h index e205d326c6..af4f1b2eac 100644 --- a/lib/gpu/lal_zbl.h +++ b/lib/gpu/lal_zbl.h @@ -76,7 +76,7 @@ class ZBL : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp index f15e814a50..ee7794af2d 100644 --- a/lib/gpu/lal_zbl_ext.cpp +++ b/lib/gpu/lal_zbl_ext.cpp @@ -58,7 +58,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, 
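Each loop() now returns its grid size, which compute() forwards to copy_answers() as red_blocks so the host side knows how many per-block partial energy/virial sums may still need reducing. The grid arithmetic, common to all the pair styles touched here, in a short sketch:

```python
import math

def grid_size(inum, block_size, t_per_atom):
    """Blocks needed when each atom gets t_per_atom threads and each
    block holds block_size threads (sketch of the loop() computation)."""
    atoms_per_block = block_size // t_per_atom
    return math.ceil(inum / atoms_per_block)

# 10000 atoms, 128-thread blocks, 4 threads per atom -> 313 blocks,
# returned as red_blocks for ans->copy_answers()
print(grid_size(10000, 128, 4))
```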
host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->world_barrier(); if (message) @@ -77,7 +77,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->gpu_barrier(); if (message) diff --git a/python/lammps/core.py b/python/lammps/core.py index d1bc7bc138..e13bf9585b 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -286,15 +286,16 @@ class lammps(object): self.lib.lammps_fix_external_set_energy_global = [c_void_p, c_char_p, c_double] self.lib.lammps_fix_external_set_virial_global = [c_void_p, c_char_p, POINTER(c_double)] - # detect if Python is using version of mpi4py that can pass a communicator - + # detect if Python is using a version of mpi4py that can pass communicators + # only needed if LAMMPS has been compiled with MPI support. self.has_mpi4py = False - try: - from mpi4py import __version__ as mpi4py_version - # tested to work with mpi4py versions 2 and 3 - self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] - except: - pass + if self.has_mpi_support: + try: + from mpi4py import __version__ as mpi4py_version + # tested to work with mpi4py versions 2 and 3 + self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] + except: + pass # if no ptr provided, create an instance of LAMMPS # don't know how to pass an MPI communicator from PyPar @@ -307,23 +308,27 @@ class lammps(object): if not ptr: - # with mpi4py v2, can pass MPI communicator to LAMMPS + # with mpi4py v2+, we can pass MPI communicators to LAMMPS # need to adjust for type of MPI communicator object # allow for int (like MPICH) or void* (like OpenMPI) - if self.has_mpi4py and self.has_mpi_support: + if self.has_mpi_support and self.has_mpi4py: from mpi4py import MPI self.MPI = MPI if comm: - if not self.has_mpi4py: - raise Exception('Python mpi4py version is not 2 or 3') if not self.has_mpi_support: raise Exception('LAMMPS not compiled with real MPI library') + if not self.has_mpi4py: + raise Exception('Python mpi4py version is not 2 or 3') if self.MPI._sizeof(self.MPI.Comm) == sizeof(c_int): MPI_Comm = c_int else: MPI_Comm = c_void_p + # Detect whether LAMMPS and mpi4py definitely use different MPI libs + if sizeof(MPI_Comm) != self.lib.lammps_config_has_mpi_support(): + raise Exception('Inconsistent MPI library in LAMMPS and mpi4py') + narg = 0 cargs = None if cmdargs: @@ -1612,7 +1617,7 @@ class lammps(object): def get_neighlist(self, idx): """Returns an instance of :class:`NeighList` which wraps access to the neighbor list with the given index - See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use + See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use NumPy arrays instead of ``c_int`` pointers. 
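With the reordering above, mpi4py is only probed when the LAMMPS library itself reports MPI support, and a mismatch between the communicator handle sizes of the two MPI stacks is rejected up front. A hedged usage sketch (assumes an MPI-enabled build of the library plus mpi4py 2 or 3):

```python
from mpi4py import MPI
from lammps import lammps

# split the world and hand LAMMPS only this rank's sub-communicator;
# the constructor raises if the library lacks MPI support or if the
# mpi4py and LAMMPS MPI libraries are inconsistent
comm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2, key=0)
lmp = lammps(comm=comm)
lmp.command("units lj")
lmp.close()
```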
:param idx: index of neighbor list diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py index 47a2a5a6ab..4bba9f5e94 100644 --- a/python/lammps/pylammps.py +++ b/python/lammps/pylammps.py @@ -400,6 +400,7 @@ class PyLammps(object): self.lmp = lammps(name=name,cmdargs=cmdargs,ptr=None,comm=comm) print("LAMMPS output is captured by PyLammps wrapper") self._cmd_history = [] + self._enable_cmd_history = False self.runs = [] def __del__(self): @@ -434,6 +435,24 @@ class PyLammps(object): """ self.lmp.file(file) + @property + def enable_cmd_history(self): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + return self._enable_cmd_history + + @enable_cmd_history.setter + def enable_cmd_history(self, value): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + self._enable_cmd_history = (value == True) + def write_script(self, filepath): """ Write LAMMPS script file containing all commands executed up until now @@ -445,18 +464,28 @@ class PyLammps(object): for cmd in self._cmd_history: print(cmd, file=f) + def clear_cmd_history(self): + """ + Clear LAMMPS command history up to this point + """ + self._cmd_history = [] + def command(self, cmd): """ Execute LAMMPS command - All commands executed will be stored in a command history which can be - written to a file using :py:meth:`PyLammps.write_script()` + If :py:attr:`PyLammps.enable_cmd_history` is set to ``True``, commands executed + will be recorded. The entire command history can be written to a file using + :py:meth:`PyLammps.write_script()`. To clear the command history, use + :py:meth:`PyLammps.clear_cmd_history()`. :param cmd: command string that should be executed :type: cmd: string """ self.lmp.command(cmd) - self._cmd_history.append(cmd) + + if self.enable_cmd_history: + self._cmd_history.append(cmd) def run(self, *args, **kwargs): """ diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 1fefb01d42..49b7eeda57 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -30,6 +30,16 @@ action () { action fix_gpu.cpp action fix_gpu.h +action fix_nve_gpu.h +action fix_nve_gpu.cpp +action fix_nh_gpu.h +action fix_nh_gpu.cpp +action fix_nvt_gpu.h +action fix_nvt_gpu.cpp +action fix_npt_gpu.h +action fix_npt_gpu.cpp +action fix_nve_asphere_gpu.h fix_nve_asphere.h +action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h action pair_beck_gpu.cpp action pair_beck_gpu.h @@ -83,6 +93,8 @@ action pair_lj96_cut_gpu.cpp action pair_lj96_cut_gpu.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp action pair_lj_charmm_coul_long_gpu.h pair_lj_charmm_coul_long.cpp +action pair_lj_charmm_coul_charmm_gpu.cpp pair_lj_charmm_coul_charmm.cpp +action pair_lj_charmm_coul_charmm_gpu.h pair_lj_charmm_coul_charmm.cpp action pair_lj_class2_coul_long_gpu.cpp pair_lj_class2_coul_long.cpp action pair_lj_class2_coul_long_gpu.h pair_lj_class2_coul_long.cpp action pair_lj_class2_gpu.cpp pair_lj_class2.cpp diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 8f88dfd61d..8297c338a5 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -15,6 +15,7 @@ #include #include "atom.h" +#include "comm.h" #include "force.h" #include "pair.h" #include "pair_hybrid.h" @@ -31,21 +32,28 @@ #include "citeme.h" #include "error.h" +#if (LAL_USE_OMP == 1) +#include +#endif using namespace LAMMPS_NS; using namespace FixConst; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; -extern 
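Command recording in PyLammps is now opt-in: enable_cmd_history defaults to False, command() only appends to the history when it is enabled, and clear_cmd_history() resets the buffer. A short usage sketch:

```python
from lammps import PyLammps

L = PyLammps()
L.enable_cmd_history = True   # opt in; recording is now off by default
L.units("lj")                 # attribute-style calls go through command()
L.atom_style("atomic")
L.write_script("in.replay")   # dumps the two recorded commands
L.clear_cmd_history()         # empty the history again
```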
int lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split, - const int nthreads, const int t_per_atom, - const double cell_size, char *opencl_flags, +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, + const double particle_split, const int t_per_atom, + const double cell_size, char *opencl_args, + const int ocl_platform, char *device_type_flags, const int block_pair); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); + double **vatom, double *virial, double &ecoul, + int &err_flag); +extern double lmp_gpu_update_bin_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut); static const char cite_gpu_package[] = "GPU package (short-range, long-range and three-body potentials):\n\n" @@ -105,22 +113,27 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (narg < 4) error->all(FLERR,"Illegal package gpu command"); + // If ngpu is 0, autoset ngpu to the number of devices per node matching + // best device int ngpu = atoi(arg[3]); - if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command"); - int first_gpu = 0; - int last_gpu = ngpu-1; + if (ngpu < 0) error->all(FLERR,"Illegal package gpu command"); + + // Negative value indicate GPU package should find the best device ID + int first_gpu_id = -1; // options _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int nthreads = 1; + int nthreads = 0; int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; - char *opencl_flags = nullptr; + char *opencl_args = nullptr; int block_pair = -1; int pair_only_flag = 0; + int ocl_platform = -1; + char *device_type_flags = nullptr; int iarg = 4; while (iarg < narg) { @@ -149,22 +162,25 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"Illegal package GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"gpuID") == 0) { - if (iarg+3 > narg) error->all(FLERR,"Illegal package gpu command"); - first_gpu = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - last_gpu = utils::inumeric(FLERR,arg[iarg+2],false,lmp); - iarg += 3; + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + first_gpu_id = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else if (strcmp(arg[iarg],"tpa") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"nthreads") == 0) { + } else if (strcmp(arg[iarg],"omp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + if (nthreads < 0) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; - } else if (strcmp(arg[iarg],"device") == 0) { + } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); - opencl_flags = arg[iarg+1]; + ocl_platform = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; + } else if (strcmp(arg[iarg],"device_type") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + device_type_flags = arg[iarg+1]; iarg += 2; } else if (strcmp(arg[iarg],"blocksize") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -176,12 +192,21 
@@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"on") == 0) pair_only_flag = 1; else error->all(FLERR,"Illegal package gpu command"); iarg += 2; + } else if (strcmp(arg[iarg],"ocl_args") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + opencl_args = arg[iarg+1]; + iarg += 2; } else error->all(FLERR,"Illegal package gpu command"); } - #ifndef _OPENMP + #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); + #else + if (nthreads > 0) { + omp_set_num_threads(nthreads); + comm->nthreads = nthreads; + } #endif // set newton pair flag @@ -207,10 +232,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : // change binsize default (0.0) to -1.0 used by GPU lib if (binsize == 0.0) binsize = -1.0; - int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, - _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_flags, - block_pair); + _binsize = binsize; + int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, + _gpu_mode, _particle_split, threads_per_atom, + binsize, opencl_args, ocl_platform, + device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); } @@ -296,9 +322,15 @@ void FixGPU::post_force(int /* vflag */) timer->stamp(); double lvirial[6]; for (int i = 0; i < 6; i++) lvirial[i] = 0.0; + int err_flag; double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom, force->pair->vatom, lvirial, - force->pair->eng_coul); + force->pair->eng_coul, err_flag); + if (err_flag) { + if (err_flag==1) + error->one(FLERR, + "Too many neighbors on GPU. Use neigh_modify one to increase limit."); + } force->pair->eng_vdwl += my_eng; force->pair->virial[0] += lvirial[0]; @@ -335,3 +367,12 @@ double FixGPU::memory_usage() return bytes; } +double FixGPU::binsize(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + if (_binsize > 0.0) return _binsize; + else if (_gpu_mode == GPU_FORCE || comm->cutghostuser) + return cut * 0.5; + else + return lmp_gpu_update_bin_size(subx, suby, subz, nlocal, cut); +} diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index ba0b4c83cb..29a0907915 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,10 +37,14 @@ class FixGPU : public Fix { void post_force_respa(int, int, int); double memory_usage(); + double binsize(const double subx, const double suby, + const double subz, const int nlocal, const double cut); + private: int _gpu_mode; int _nlevels_respa; double _particle_split; + double _binsize; }; } @@ -78,4 +82,11 @@ E: Cannot use neigh_modify exclude with GPU neighbor builds This is a current limitation of the GPU implementation in LAMMPS. +E: Too many neighbors on GPU. Use neigh_modify one to increase limit. + +The expected maximum number of neighbors is determined in the GPU package +automatically. This error means the actual number of neighbors is exceeding +the expected value. Use neigh_modify one command to increase GPU allocations +(e.g. doubling this value doubles the GPU allocation). 
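Together with the parsing changes above, a package command might now look like the sketch below (driven through the Python wrapper; the option values are illustrative and require a GPU-enabled build):

```python
from lammps import lammps

lmp = lammps()
# ngpu 0 lets the library auto-select all matching devices per node;
# "omp 4" replaces the old "nthreads", "platform" picks the OpenCL platform
lmp.command("package gpu 0 omp 4 platform 0")
# if a run later aborts with "Too many neighbors on GPU", enlarge the
# per-atom neighbor allocation (page must stay >= 10x one)
lmp.command("neigh_modify one 4000 page 40000")
lmp.close()
```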
+ */ diff --git a/src/GPU/fix_nh_gpu.cpp b/src/GPU/fix_nh_gpu.cpp new file mode 100644 index 0000000000..8b57289a50 --- /dev/null +++ b/src/GPU/fix_nh_gpu.cpp @@ -0,0 +1,552 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nh_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define TILTMAX 1.5 + +enum{NOBIAS,BIAS}; +enum{ISO,ANISO,TRICLINIC}; + +typedef struct { double x,y,z; } dbl3_t; + +/* ---------------------------------------------------------------------- + NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion + ---------------------------------------------------------------------- */ + +FixNHGPU::FixNHGPU(LAMMPS *lmp, int narg, char **arg) : + FixNH(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal3 = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNHGPU::~FixNHGPU() +{ +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::setup(int vflag) +{ + FixNH::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + reset_dt(); +} + +/* ---------------------------------------------------------------------- + change box size + remap all atoms or dilate group atoms depending on allremap flag + if rigid bodies exist, scale rigid body centers-of-mass +------------------------------------------------------------------------- */ + +void FixNHGPU::remap() +{ + if (_respa_on) { FixNH::remap(); return; } + + double oldlo,oldhi; + double expfac; + + dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + double *h = domain->h; + + // omega is not used, except for book-keeping + + for (int i = 0; i < 6; i++) omega[i] += dto*omega_dot[i]; + + // convert pertinent atoms and rigid bodies to lamda coords + const double hi0 = domain->h_inv[0]; + const double hi1 = domain->h_inv[1]; + const double hi2 = domain->h_inv[2]; + const double hi3 = domain->h_inv[3]; + const double hi4 = domain->h_inv[4]; + const double hi5 = domain->h_inv[5]; + const double b0 = domain->boxlo[0]; + const double b1 = domain->boxlo[1]; + const double b2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + 
hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(0); + + // reset global and local box to new size/shape + + // this operation corresponds to applying the + // translate and scale operations + // corresponding to the solution of the following ODE: + // + // h_dot = omega_dot * h + // + // where h_dot, omega_dot and h are all upper-triangular + // 3x3 tensors. In Voigt notation, the elements of the + // RHS product tensor are: + // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1] + // + // Ordering of operations preserves time symmetry. + + double dto2 = dto/2.0; + double dto4 = dto/4.0; + double dto8 = dto/8.0; + + // off-diagonal components, first half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + } + + // scale diagonal components + // scale tilt factors with cell, if set + + if (p_flag[0]) { + oldlo = domain->boxlo[0]; + oldhi = domain->boxhi[0]; + expfac = exp(dto*omega_dot[0]); + domain->boxlo[0] = (oldlo-fixedpoint[0])*expfac + fixedpoint[0]; + domain->boxhi[0] = (oldhi-fixedpoint[0])*expfac + fixedpoint[0]; + } + + if (p_flag[1]) { + oldlo = domain->boxlo[1]; + oldhi = domain->boxhi[1]; + expfac = exp(dto*omega_dot[1]); + domain->boxlo[1] = (oldlo-fixedpoint[1])*expfac + fixedpoint[1]; + domain->boxhi[1] = (oldhi-fixedpoint[1])*expfac + fixedpoint[1]; + if (scalexy) h[5] *= expfac; + } + + if (p_flag[2]) { + oldlo = domain->boxlo[2]; + oldhi = domain->boxhi[2]; + expfac = exp(dto*omega_dot[2]); + domain->boxlo[2] = (oldlo-fixedpoint[2])*expfac + fixedpoint[2]; + domain->boxhi[2] = (oldhi-fixedpoint[2])*expfac + fixedpoint[2]; + if (scalexz) h[4] *= expfac; + if (scaleyz) h[3] *= expfac; + } + + // off-diagonal components, second half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + } + + domain->yz = h[3]; + domain->xz = h[4]; + domain->xy = h[5]; + + // tilt factor to cell length ratio can not exceed TILTMAX in one step + + if (domain->yz < -TILTMAX*domain->yprd || + 
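The Voigt shorthand in the comment above expands to the following cell-update ODE, which remap() integrates over one barostat substep dto (a restatement of the comment, not new math):

```latex
% \dot{h} = \dot{\omega}\, h with h and \dot{\omega} upper triangular,
% Voigt order (0:xx, 1:yy, 2:zz, 3:yz, 4:xz, 5:xy):
\begin{align*}
  \dot h_0 &= \dot\omega_0 h_0,  &  \dot h_3 &= \dot\omega_1 h_3 + \dot\omega_3 h_2, \\
  \dot h_1 &= \dot\omega_1 h_1,  &  \dot h_4 &= \dot\omega_0 h_4 + \dot\omega_5 h_3 + \dot\omega_4 h_2, \\
  \dot h_2 &= \dot\omega_2 h_2,  &  \dot h_5 &= \dot\omega_0 h_5 + \dot\omega_5 h_1.
\end{align*}
% The diagonal equations have the exact solution
%   h_i(t + dto) = e^{\dot\omega_i\, dto}\, h_i(t),
% which is the expfac scaling in the code; the off-diagonal terms are
% split over dto/2, dto/4 and dto/8 so the update stays time reversible.
```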
domain->yz > TILTMAX*domain->yprd || + domain->xz < -TILTMAX*domain->xprd || + domain->xz > TILTMAX*domain->xprd || + domain->xy < -TILTMAX*domain->xprd || + domain->xy > TILTMAX*domain->xprd) + error->all(FLERR,"Fix npt/nph has tilted box too far in one step - " + "periodic cell is too far from equilibrium state"); + + domain->set_global_box(); + domain->set_local_box(); + + // convert pertinent atoms and rigid bodies back to box coords + const double h0 = domain->h[0]; + const double h1 = domain->h[1]; + const double h2 = domain->h[2]; + const double h3 = domain->h[3]; + const double h4 = domain->h[4]; + const double h5 = domain->h[5]; + const double nb0 = domain->boxlo[0]; + const double nb1 = domain->boxlo[1]; + const double nb2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(1); +} + +/* ---------------------------------------------------------------------- + 2nd half of Verlet update +------------------------------------------------------------------------- */ + +void FixNHGPU::final_integrate() { + if (neighbor->ago == 0 && _respa_on == 0) reset_dt(); + FixNH::final_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::reset_dt() +{ + if (_respa_on) { FixNH::reset_dt(); return; } + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + dthalf = 0.5 * update->dt; + dt4 = 0.25 * update->dt; + dt8 = 0.125 * update->dt; + dto = dthalf; + + if (pstat_flag) + pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain); + + if (tstat_flag) + tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); + + const int * const mask = atom->mask; + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nh_gpu:dtfm"); + } + + _nlocal3 = nlocal * 3; + + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step barostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_press() +{ + if (pstyle == TRICLINIC || which == BIAS || _respa_on) { + FixNH::nh_v_press(); + return; + } + + dbl3_t * _noalias const v = (dbl3_t *)atom->v[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + double f0 = exp(-dt4*(omega_dot[0]+mtk_term2)); + double f1 = exp(-dt4*(omega_dot[1]+mtk_term2)); + double f2 = exp(-dt4*(omega_dot[2]+mtk_term2)); + f0 *= f0; + f1 *= f1; + f2 *= f2; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step update of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_v() +{ + if (_respa_on) { FixNH::nve_v(); return; } + + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] += _dtfm[i] * f[i]; +} + +/* ---------------------------------------------------------------------- + perform full-step update of positions +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_x() +{ + if (_respa_on) { FixNH::nve_x(); 
return; } + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + + // x update by full step only for atoms in group + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + x[i] += dtv * v[i]; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + x[i] += dtv * v[i]; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step thermostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_temp() +{ + if (which == BIAS || _respa_on) { + FixNH::nh_v_temp(); + return; + } + + double * _noalias const v = atom->v[0]; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] *= factor_eta; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + v[i] *= factor_eta; + } + } +} + +double FixNHGPU::memory_usage() +{ + return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nh_gpu.h b/src/GPU/fix_nh_gpu.h new file mode 100644 index 0000000000..edd210e813 --- /dev/null +++ b/src/GPU/fix_nh_gpu.h @@ -0,0 +1,164 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifndef LMP_FIX_NH_GPU_H +#define LMP_FIX_NH_GPU_H + +#include "fix_nh.h" + +namespace LAMMPS_NS { + +class FixNHGPU : public FixNH { + public: + FixNHGPU(class LAMMPS *, int, char **); + virtual ~FixNHGPU(); + virtual void setup(int vflag); + void reset_dt(); + virtual void final_integrate(); + virtual double memory_usage(); + + protected: + double *_dtfm; + int _nlocal3, _nlocal_max, _respa_on; + + virtual void remap(); + virtual void nve_x(); + virtual void nve_v(); + virtual void nh_v_press(); + virtual void nh_v_temp(); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Target temperature for fix nvt/npt/nph cannot be 0.0 + +Self-explanatory. 
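reset_dt() above replaces FixNH's per-step mass and group tests with one flat 3*nlocal array, so nve_v() and nve_x() reduce to branch-free SIMD loops; entries for atoms outside the group are zero, which doubles as the group mask in nve_x(). A NumPy sketch of the same precomputation (names hypothetical):

```python
import numpy as np

def build_dtfm(dtf, mass, in_group):
    """Flattened per-component dtf/m, zeroed outside the group
    (sketch of FixNHGPU::reset_dt)."""
    per_atom = np.where(in_group, dtf / mass, 0.0)
    return np.repeat(per_atom, 3)        # x, y, z share one factor

def nve_v(v3, f3, dtfm):                 # FixNHGPU::nve_v
    v3 += dtfm * f3

def nve_x(x3, v3, dtv, dtfm):            # FixNHGPU::nve_x, group via dtfm != 0
    moving = dtfm != 0.0
    x3[moving] += dtv * v3[moving]

dtfm = build_dtfm(0.005, np.array([1.0, 2.0]), np.array([True, False]))
assert dtfm.tolist() == [0.005, 0.005, 0.005, 0.0, 0.0, 0.0]
```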
+ +E: Invalid fix nvt/npt/nph command for a 2d simulation + +Cannot control z dimension in a 2d model. + +E: Fix nvt/npt/nph dilate group ID does not exist + +Self-explanatory. + +E: Invalid fix nvt/npt/nph command pressure settings + +If multiple dimensions are coupled, those dimensions must be +specified. + +E: Cannot use fix nvt/npt/nph on a non-periodic dimension + +When specifying a diagonal pressure component, the dimension must be +periodic. + +E: Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension + +When specifying an off-diagonal pressure component, the 2nd of the two +dimensions must be periodic. E.g. if the xy component is specified, +then the y dimension must be periodic. + +E: Cannot use fix nvt/npt/nph with yz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xy scaling when y is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with both yz dynamics and yz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xz dynamics and xz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xy dynamics and xy scaling + +Self-explanatory. + +E: Can not specify Pxy/Pxz/Pyz in fix nvt/npt/nph with non-triclinic box + +Only triclinic boxes can be used with off-diagonal pressure components. +See the region prism command for details. + +E: Invalid fix nvt/npt/nph pressure settings + +Settings for coupled dimensions must be the same. + +E: Fix nvt/npt/nph damping parameters must be > 0.0 + +Self-explanatory. + +E: Cannot use fix npt and fix deform on same component of stress tensor + +This would be changing the same box dimension twice. + +E: Temperature ID for fix nvt/npt does not exist + +Self-explanatory. + +E: Pressure ID for fix npt/nph does not exist + +Self-explanatory. + +E: Fix npt/nph has tilted box too far in one step - periodic cell is too far from equilibrium state + +Self-explanatory. The change in the box tilt is too extreme +on a short timescale. + +E: Could not find fix_modify temperature ID + +The compute ID for computing temperature does not exist. + +E: Fix_modify temperature ID does not compute temperature + +The compute ID assigned to the fix must compute temperature. + +W: Temperature for fix modify is not for group all + +The temperature compute is being used with a pressure calculation +which does operate on group all, so this may be inconsistent. + +E: Pressure ID for fix modify does not exist + +Self-explanatory. + +E: Could not find fix_modify pressure ID + +The compute ID for computing pressure does not exist. + +E: Fix_modify pressure ID does not compute pressure + +The compute ID assigned to the fix must compute pressure. + +*/ diff --git a/src/GPU/fix_npt_gpu.cpp b/src/GPU/fix_npt_gpu.cpp new file mode 100644 index 0000000000..2ba0be29e0 --- /dev/null +++ b/src/GPU/fix_npt_gpu.cpp @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_npt_gpu.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+FixNPTGPU::FixNPTGPU(LAMMPS *lmp, int narg, char **arg) :
+  FixNHGPU(lmp, narg, arg)
+{
+  if (!tstat_flag)
+    error->all(FLERR,"Temperature control must be used with fix npt/gpu");
+  if (!pstat_flag)
+    error->all(FLERR,"Pressure control must be used with fix npt/gpu");
+
+  // create a new compute temp style
+  // id = fix-ID + temp
+  // compute group = all since pressure is always global (group all)
+  // and thus its KE/temperature contribution should use group all
+
+  int n = strlen(id) + 6;
+  id_temp = new char[n];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char **newarg = new char*[3];
+  newarg[0] = id_temp;
+  newarg[1] = (char *) "all";
+  newarg[2] = (char *) "temp";
+
+  modify->add_compute(3,newarg);
+  delete [] newarg;
+  tcomputeflag = 1;
+
+  // create a new compute pressure style
+  // id = fix-ID + press, compute group = all
+  // pass id_temp as 4th arg to pressure constructor
+
+  n = strlen(id) + 7;
+  id_press = new char[n];
+  strcpy(id_press,id);
+  strcat(id_press,"_press");
+
+  newarg = new char*[4];
+  newarg[0] = id_press;
+  newarg[1] = (char *) "all";
+  newarg[2] = (char *) "pressure";
+  newarg[3] = id_temp;
+  modify->add_compute(4,newarg);
+  delete [] newarg;
+  pcomputeflag = 1;
+}
diff --git a/src/GPU/fix_npt_gpu.h b/src/GPU/fix_npt_gpu.h
new file mode 100644
index 0000000000..2684935fe5
--- /dev/null
+++ b/src/GPU/fix_npt_gpu.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(npt/gpu,FixNPTGPU)
+
+#else
+
+#ifndef LMP_FIX_NPT_GPU_H
+#define LMP_FIX_NPT_GPU_H
+
+#include "fix_nh_gpu.h"
+
+namespace LAMMPS_NS {
+
+class FixNPTGPU : public FixNHGPU {
+ public:
+  FixNPTGPU(class LAMMPS *, int, char **);
+  ~FixNPTGPU() {}
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Temperature control must be used with fix npt/gpu
+
+Self-explanatory.
+
+E: Pressure control must be used with fix npt/gpu
+
+Self-explanatory.
+ +*/ diff --git a/src/GPU/fix_nve_asphere_gpu.cpp b/src/GPU/fix_nve_asphere_gpu.cpp new file mode 100644 index 0000000000..bf6cfda67d --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.cpp @@ -0,0 +1,440 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_asphere_gpu.h" + +#include "atom.h" +#include "atom_vec_ellipsoid.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#include +#if (LAL_USE_OMP == 1) +#include +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid + +#define ME_qnormalize(q) \ +{ \ + double norm = 1.0 / \ + sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ + q##_w *= norm; \ + q##_i *= norm; \ + q##_j *= norm; \ + q##_k *= norm; \ +} + +#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ +{ \ + double wbody_0, wbody_1, wbody_2; \ + double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ + \ + double w2 = quat##_w * quat##_w; \ + double i2 = quat##_i * quat##_i; \ + double j2 = quat##_j * quat##_j; \ + double k2 = quat##_k * quat##_k; \ + double twoij = 2.0 * quat##_i * quat##_j; \ + double twoik = 2.0 * quat##_i * quat##_k; \ + double twojk = 2.0 * quat##_j * quat##_k; \ + double twoiw = 2.0 * quat##_i * quat##_w; \ + double twojw = 2.0 * quat##_j * quat##_w; \ + double twokw = 2.0 * quat##_k * quat##_w; \ + \ + rot##_0 = w2 + i2 - j2 - k2; \ + rot##_1 = twoij - twokw; \ + rot##_2 = twojw + twoik; \ + \ + rot##_3 = twoij + twokw; \ + rot##_4 = w2 - i2 + j2 - k2; \ + rot##_5 = twojk - twoiw; \ + \ + rot##_6 = twoik - twojw; \ + rot##_7 = twojk + twoiw; \ + rot##_8 = w2 - i2 - j2 + k2; \ + \ + wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \ + wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \ + wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \ + \ + wbody_0 *= moments_0; \ + wbody_1 *= moments_1; \ + wbody_2 *= moments_2; \ + \ + w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \ + w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \ + w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \ +} + +#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ +{ \ + angmomin[0] += dtf * torque[0]; \ + double angmom_0 = angmomin[0]; \ + angmomin[1] += dtf * torque[1]; \ + double angmom_1 = angmomin[1]; \ + angmomin[2] += dtf * torque[2]; \ + double angmom_2 = angmomin[2]; \ + \ + double quat_w = quatin[0]; \ + double quat_i = quatin[1]; \ + double quat_j = quatin[2]; \ + double quat_k = quatin[3]; \ + \ + double omega_0, omega_1, omega_2; \ + ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ + \ + double wq_0, 
wq_1, wq_2, wq_3; \ + wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ + wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ + wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ + wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ + \ + double qfull_w, qfull_i, qfull_j, qfull_k; \ + qfull_w = quat_w + dtq * wq_0; \ + qfull_i = quat_i + dtq * wq_1; \ + qfull_j = quat_j + dtq * wq_2; \ + qfull_k = quat_k + dtq * wq_3; \ + ME_qnormalize(qfull); \ + \ + double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ + qhalf_w = quat_w + 0.5*dtq * wq_0; \ + qhalf_i = quat_i + 0.5*dtq * wq_1; \ + qhalf_j = quat_j + 0.5*dtq * wq_2; \ + qhalf_k = quat_k + 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ + wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ + wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ + wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ + wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ + \ + qhalf_w += 0.5*dtq * wq_0; \ + qhalf_i += 0.5*dtq * wq_1; \ + qhalf_j += 0.5*dtq * wq_2; \ + qhalf_k += 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + quat_w = 2.0*qhalf_w - qfull_w; \ + quat_i = 2.0*qhalf_i - qfull_i; \ + quat_j = 2.0*qhalf_j - qfull_j; \ + quat_k = 2.0*qhalf_k - qfull_k; \ + ME_qnormalize(quat); \ + \ + quatin[0] = quat_w; \ + quatin[1] = quat_i; \ + quatin[2] = quat_j; \ + quatin[3] = quat_k; \ +} + +/* ---------------------------------------------------------------------- */ + +FixNVEAsphereGPU::FixNVEAsphereGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; + _inertia0 = 0; + _inertia1 = 0; + _inertia2 = 0; +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::init() +{ + avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); + if (!avec) + error->all(FLERR,"Compute nve/asphere requires atom style ellipsoid"); + + // check that all particles are finite-size ellipsoids + // no point particles allowed, spherical is OK + + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (ellipsoid[i] < 0) + error->one(FLERR,"Fix nve/asphere requires extended particles"); + + FixNVE::init(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + reset_dt(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::initial_integrate(int /*vflag*/) +{ + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + int *mask = atom->mask; + + double **angmom = atom->angmom; + double **torque = atom->torque; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + // set timestep here since dt may have changed or come via rRESPA + + dtq = 0.5 * dtv; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const 
int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + + // update angular momentum by 1/2 step + if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], + _inertia1[i], _inertia2[i]); + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], + _inertia0[i], _inertia1[i], _inertia2[i]); + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::final_integrate() +{ + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + double * _noalias const angmom = atom->angmom[0]; + const double * _noalias const torque = atom->torque[0]; + + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + double dtfo; + if (neighbor->ago == 0) dtfo = reset_dt_omp(ifrom, ito, tid); + else dtfo = dtf; + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + angmom[i] += dtfo * torque[i]; + } + } +} + +void FixNVEAsphereGPU::reset_dt() { + const int nlocal = (igroup == atom->firstgroup) ?
atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + reset_dt_omp(ifrom, ito, tid); + } +} + +double FixNVEAsphereGPU::reset_dt_omp(const int ifrom, const int ito, + const int tid) { + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + const int * const mask = atom->mask; + + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + if (igroup == 0) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } + } else { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } + return dtfo; +} + +double FixNVEAsphereGPU::memory_usage() +{ + return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); +} + diff --git a/src/GPU/fix_nve_asphere_gpu.h b/src/GPU/fix_nve_asphere_gpu.h new file mode 100644 index 0000000000..3c67e0e024 --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.h @@ -0,0 +1,63 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License.
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/asphere/gpu,FixNVEAsphereGPU) + +#else + +#ifndef LMP_FIX_NVE_ASPHERE_GPU_H +#define LMP_FIX_NVE_ASPHERE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEAsphereGPU : public FixNVE { + public: + FixNVEAsphereGPU(class LAMMPS *, int, char **); + void init(); + void setup(int vflag); + void initial_integrate(int); + void final_integrate(); + void reset_dt(); + virtual double memory_usage(); + + private: + double reset_dt_omp(const int, const int, const int); + double *_dtfm, *_inertia0, *_inertia1, *_inertia2; + int _nlocal_max; + double dtq; + class AtomVecEllipsoid *avec; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Compute nve/asphere requires atom style ellipsoid + +Self-explanatory. + +E: Fix nve/asphere requires extended particles + +This fix can only be used for particles with a shape setting. + +*/ diff --git a/src/GPU/fix_nve_gpu.cpp b/src/GPU/fix_nve_gpu.cpp new file mode 100644 index 0000000000..c3dd5b6ae2 --- /dev/null +++ b/src/GPU/fix_nve_gpu.cpp @@ -0,0 +1,291 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_gpu.h" +#include <cstring> +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#if (LAL_USE_OMP == 1) +#include <omp.h> +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::FixNVEGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::~FixNVEGPU() +{ + memory->destroy(_dtfm); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + if (atom->ntypes > 1) reset_dt(); +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +void FixNVEGPU::initial_integrate(int vflag) +{ + if (_respa_on) { FixNVE::initial_integrate(vflag); return; } + + // update v and x of atoms in group + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + const int nlocal3 = nlocal * 3; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int idelta = nlocal3 / nthreads + 1; + const int ifrom3 = omp_get_thread_num() * idelta; + const int ito3 = MIN(ifrom3 + idelta, nlocal3); + #else + const int ifrom3 = 0; + const int ito3 = nlocal3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += dtfm * f[i]; + x[i] += dtv * v[i]; + } + } else if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + if (_dtfm[i] != 0.0) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::final_integrate() +{ + if (_respa_on) { FixNVE::final_integrate(); return; } + // update v of atoms in group + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ?
atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (igroup != 0 || atom->ntypes != 1 || atom->rmass) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += dtfm * f[i]; + } else if (igroup == 0) { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } else { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } + } +} + +void FixNVEGPU::reset_dt() { + if (_respa_on) { FixNVE::reset_dt(); return; } + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + } else { + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + + reset_dt_omp(ifrom, ito, tid); + } + } +} + +void FixNVEGPU::reset_dt_omp(const int ifrom, const int ito, const int tid) { + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + const int * const mask = atom->mask; + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int
i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +double FixNVEGPU::memory_usage() +{ + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + return FixNVE::memory_usage() + nlocal * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nve_gpu.h b/src/GPU/fix_nve_gpu.h new file mode 100644 index 0000000000..1042d4eadd --- /dev/null +++ b/src/GPU/fix_nve_gpu.h @@ -0,0 +1,60 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/gpu,FixNVEGPU) + +#else + +#ifndef LMP_FIX_NVE_GPU_H +#define LMP_FIX_NVE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEGPU : public FixNVE { + public: + FixNVEGPU(class LAMMPS *, int, char **); + virtual ~FixNVEGPU(); + virtual void setup(int); + virtual void initial_integrate(int); + virtual void final_integrate(); + virtual void reset_dt(); + virtual double memory_usage(); + + protected: + void reset_dt_omp(const int, const int, const int); + double *_dtfm; + int _nlocal_max, _respa_on; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/GPU/fix_nvt_gpu.cpp b/src/GPU/fix_nvt_gpu.cpp new file mode 100644 index 0000000000..7d7826b6bf --- /dev/null +++ b/src/GPU/fix_nvt_gpu.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include <cstring> +#include "fix_nvt_gpu.h" +#include "group.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVTGPU::FixNVTGPU(LAMMPS *lmp, int narg, char **arg) : + FixNHGPU(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nvt"); + if (pstat_flag) + error->all(FLERR,"Pressure control can not be used with fix nvt"); + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tcomputeflag = 1; +} + diff --git a/src/GPU/fix_nvt_gpu.h b/src/GPU/fix_nvt_gpu.h new file mode 100644 index 0000000000..7ccba97040 --- /dev/null +++ b/src/GPU/fix_nvt_gpu.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/gpu,FixNVTGPU) + +#else + +#ifndef LMP_FIX_NVT_GPU_H +#define LMP_FIX_NVT_GPU_H + +#include "fix_nh_gpu.h" + +namespace LAMMPS_NS { + +class FixNVTGPU : public FixNHGPU { + public: + FixNVTGPU(class LAMMPS *, int, char **); + ~FixNVTGPU() {} +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix nvt + +Self-explanatory. + +E: Pressure control can not be used with fix nvt + +Self-explanatory.
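+ +For reference, a minimal hypothetical input-script usage of this style; the fix ID "1", the group "all", and the temperature values are illustrative only, while the argument layout follows the standard fix nvt syntax and assumes a LAMMPS build with the GPU package enabled: + + fix 1 all nvt/gpu temp 300.0 300.0 100.0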
+ +*/ diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h index 115e1f0574..1a957c9aef 100644 --- a/src/GPU/gpu_extra.h +++ b/src/GPU/gpu_extra.h @@ -21,6 +21,29 @@ #include "modify.h" #include "error.h" +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + namespace GPU_EXTRA { inline void check_flag(int error_flag, LAMMPS_NS::Error *error, @@ -61,6 +84,12 @@ namespace GPU_EXTRA { else if (all_success == -12) error->all(FLERR, "Invalid OpenCL platform ID."); + else if (all_success == -13) + error->all(FLERR, + "Invalid device configuration."); + else if (all_success == -15) + error->all(FLERR, + "P3M built for FP64 and GPU device is FP32 only."); else error->all(FLERR,"Unknown error in GPU library"); } @@ -127,12 +156,22 @@ greater than 4 for NVIDIA GPUs. E: Invalid custom OpenCL parameter string. There are not enough or too many parameters in the custom string for package -GPU. +GPU or the parameters do not meet required restrictions. E: Unknown error in GPU library Self-explanatory. +E: Invalid device configuration. + +The specified GPU or accelerator does not support the specified device +configuration. Check the output of ocl_get_devices or nvd_get_devices to +verify the correct device IDs for the GPU package. + +E: P3M built for FP64 and GPU device is FP32 only + +Either turn off GPU acceleration for PPPM or build LAMMPS with -DFFT_SINGLE + W: Increasing communication cutoff for GPU style The pair style has increased the communication cutoff to be consistent with diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp index 38cc593076..ff9537a33e 100644 --- a/src/GPU/pair_beck_gpu.cpp +++ b/src/GPU/pair_beck_gpu.cpp @@ -48,9 +48,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **host_aa, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void beck_gpu_clear(); -int ** beck_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** beck_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -160,9 +160,10 @@ void PairBeckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = beck_gpu_init(atom->ntypes+1, cutsq, aa, alpha, beta, AA, BB, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp index b65b662496..db0faab0ab 100644 --- a/src/GPU/pair_born_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp @@ -57,15 +57,15 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int bornclcs_gpu_init(const int 
ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, - double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen, double **host_cut_ljsq, - double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, + double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void bornclcs_gpu_clear(); int** bornclcs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -196,10 +196,11 @@ void PairBornCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = bornclcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp index 0a359f66cc..cad174c0de 100644 --- a/src/GPU/pair_born_coul_long_gpu.cpp +++ b/src/GPU/pair_born_coul_long_gpu.cpp @@ -195,10 +195,11 @@ void PairBornCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncl_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp index 7aba6e059b..5c8cac0ec2 100644 --- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp @@ -45,24 +45,26 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, double **offset, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + 
const double alf, const double e_shift, + const double f_shift); void borncwcs_gpu_clear(); -int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void borncwcs_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -179,10 +181,11 @@ void PairBornCoulWolfCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncwcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp index ee6fcf3cea..73e58b0a1f 100644 --- a/src/GPU/pair_born_coul_wolf_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_gpu.cpp @@ -51,13 +51,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + const double alf, const double e_shift, + const double f_shift); void borncw_gpu_clear(); int ** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); @@ -177,10 +179,11 @@ void PairBornCoulWolfGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncw_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp index 84ed4cfc04..770dad8346 100644 --- a/src/GPU/pair_born_gpu.cpp +++ b/src/GPU/pair_born_gpu.cpp @@ -48,13 +48,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int maxspecial, const double cell_size, int &gpu_mode, FILE 
*screen); void born_gpu_reinit(const int ntypes, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **offset); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **offset); void born_gpu_clear(); -int ** born_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** born_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -163,10 +163,11 @@ void PairBornGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = born_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp index 036bc0d7a8..2c9e71bc83 100644 --- a/src/GPU/pair_buck_coul_cut_gpu.cpp +++ b/src/GPU/pair_buck_coul_cut_gpu.cpp @@ -167,9 +167,10 @@ void PairBuckCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckc_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp index 3916e5634e..3d48862c6a 100644 --- a/src/GPU/pair_buck_coul_long_gpu.cpp +++ b/src/GPU/pair_buck_coul_long_gpu.cpp @@ -191,9 +191,10 @@ void PairBuckCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckcl_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp index 54c579bf72..d17f9d2072 100644 --- a/src/GPU/pair_buck_gpu.cpp +++ b/src/GPU/pair_buck_gpu.cpp @@ -47,8 +47,8 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **offset); + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset); void buck_gpu_clear(); int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void 
PairBuckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buck_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp index 2e35486993..8b7870575a 100644 --- a/src/GPU/pair_colloid_gpu.cpp +++ b/src/GPU/pair_colloid_gpu.cpp @@ -44,18 +44,18 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double **host_a12, double **host_a1, double **host_a2, double **host_d1, - double **host_d2, double **host_sigma3, double **host_sigma6, - int **host_form, const int nlocal, + double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void colloid_gpu_clear(); -int ** colloid_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** colloid_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void colloid_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -171,10 +171,11 @@ void PairColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = colloid_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, a12, a1, a2, d1, d2, sigma3, sigma6, _form, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); memory->destroy(_form); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp index 1e45aebf7b..9098f86737 100644 --- a/src/GPU/pair_coul_cut_gpu.cpp +++ b/src/GPU/pair_coul_cut_gpu.cpp @@ -47,21 +47,21 @@ int coul_gpu_init(const int ntypes, double **host_scale, double **cutsq, const double qqrd2e); void coul_gpu_reinit(const int ntypes, double **host_scale); void coul_gpu_clear(); -int ** coul_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** coul_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint 
**special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); void coul_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double coul_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +166,10 @@ void PairCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = coul_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp index f23b5acde3..1db2995810 100644 --- a/src/GPU/pair_coul_debye_gpu.cpp +++ b/src/GPU/pair_coul_debye_gpu.cpp @@ -48,20 +48,20 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, void cdebye_gpu_reinit(const int ntypes, double **host_scale); void cdebye_gpu_clear(); int ** cdebye_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void cdebye_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *host_q, const int nlocal, double *boxlo, + double *prd); double cdebye_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +167,10 @@ void PairCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdebye_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e, 
kappa); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp index 0bcffb5d2c..830ad057e6 100644 --- a/src/GPU/pair_coul_dsf_gpu.cpp +++ b/src/GPU/pair_coul_dsf_gpu.cpp @@ -57,9 +57,9 @@ int cdsf_gpu_init(const int ntypes, const int nlocal, const int nall, const double e_shift, const double f_shift, const double alpha); void cdsf_gpu_clear(); -int ** cdsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** cdsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -184,8 +184,9 @@ void PairCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdsf_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp index ef404d7a13..5b1fcd9c8f 100644 --- a/src/GPU/pair_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_coul_long_cs_gpu.cpp @@ -54,27 +54,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int clcs_gpu_init(const int ntypes, double **scale, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); +int clcs_gpu_init(const int ntypes, double **scale, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void clcs_gpu_reinit(const int ntypes, double **scale); void clcs_gpu_clear(); int ** clcs_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void clcs_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool 
&success, double *host_q, + const int nlocal, double *boxlo, double *prd); double clcs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -186,8 +186,9 @@ void PairCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = clcs_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp index 1118a012d0..af6a66fa34 100644 --- a/src/GPU/pair_coul_long_gpu.cpp +++ b/src/GPU/pair_coul_long_gpu.cpp @@ -181,8 +181,9 @@ void PairCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cl_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp index 59c0fa031f..d77d83e953 100644 --- a/src/GPU/pair_dpd_gpu.cpp +++ b/src/GPU/pair_dpd_gpu.cpp @@ -52,8 +52,8 @@ int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); void dpd_gpu_compute(const int ago, const int inum_full, const int nall, @@ -308,9 +308,10 @@ void PairDPDGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp index 8bf98cc8ed..a5ae3e3001 100644 --- a/src/GPU/pair_dpd_tstat_gpu.cpp +++ b/src/GPU/pair_dpd_tstat_gpu.cpp @@ -47,12 +47,13 @@ int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void dpd_tstat_gpu_clear(); -int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, +int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, double *boxlo, double *prd); @@ -64,8 +65,9 @@ void dpd_tstat_gpu_compute(const int ago, const int inum_full, const int nall, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); -void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, double **host_gamma, - double **host_sigma, double **host_cut); +void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, + double **host_gamma, double **host_sigma, + double **host_cut); double dpd_tstat_gpu_bytes(); #define EPSILON 1.0e-10 @@ -325,10 +327,11 @@ void PairDPDTstatGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_tstat_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, - cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + cut, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp index c1370af307..4678a6f669 100644 --- a/src/GPU/pair_eam_alloy_gpu.cpp +++ b/src/GPU/pair_eam_alloy_gpu.cpp @@ -39,21 +39,22 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, - int *host_type2frho, double ***host_rhor_spline, - double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, - int nrhor, int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, int &fp_size); + int **host_type2rhor, int **host_type2z2r, + int *host_type2frho, double ***host_rhor_spline, + double ***host_z2r_spline, double ***host_frho_spline, + double rdr, double rdrho, double rhomax, + int nrhor, int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, int &fp_size); void eam_alloy_gpu_clear(); -int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, +int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, int &inum, void **fp_ptr); void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, @@ -183,10 +184,11 @@ void PairEAMAlloyGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_alloy_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, 
atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +197,6 @@ void PairEAMAlloyGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp index ce3ea8bb0b..390bb93987 100644 --- a/src/GPU/pair_eam_fs_gpu.cpp +++ b/src/GPU/pair_eam_fs_gpu.cpp @@ -50,19 +50,19 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, void eam_fs_gpu_clear(); int** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall,double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, void **fp_ptr); + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, void **fp_ptr); void eam_fs_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom); + const bool eatom, const bool vatom); double eam_fs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -183,10 +183,11 @@ void PairEAMFSGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_fs_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +196,6 @@ void PairEAMFSGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index abd721a327..e458ea2020 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -50,11 +50,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, void eam_gpu_clear(); int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool 
vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, @@ -185,10 +185,11 @@ void PairEAMGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -197,7 +198,6 @@ void PairEAMGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp index 89b79f11f2..fe9dd9ba96 100644 --- a/src/GPU/pair_gauss_gpu.cpp +++ b/src/GPU/pair_gauss_gpu.cpp @@ -41,15 +41,16 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **b, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen); void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset); + double **b, double **offset); void gauss_gpu_clear(); -int ** gauss_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** gauss_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -158,9 +159,10 @@ void PairGaussGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gauss_gpu_init(atom->ntypes+1, cutsq, a, b, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp index 19a4c77032..81966824ba 100644 --- a/src/GPU/pair_gayberne_gpu.cpp +++ b/src/GPU/pair_gayberne_gpu.cpp @@ -49,12 +49,12 @@ int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + const double cell_size, int &gpu_mode, FILE *screen); void gb_gpu_clear(); int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint 
**special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -207,10 +207,11 @@ void PairGayBerneGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, shape2, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp index e15a78fb91..84d1a1a10d 100644 --- a/src/GPU/pair_lj96_cut_gpu.cpp +++ b/src/GPU/pair_lj96_cut_gpu.cpp @@ -160,9 +160,10 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp new file mode 100644 index 0000000000..4f8679a8a8 --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp @@ -0,0 +1,309 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
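A note on the recurring change visible in the dpd/tstat, eam, gauss, gayberne, and lj96 hunks above, and repeated through the rest of the patch: each pair style's init_style() used to pass a hardcoded estimate of 300 for the max_nbors argument of its *_gpu_init() call, and now passes `int mnf = 5e-2 * neighbor->oneatom;`, i.e. 5% of the user-tunable per-atom neighbor limit. A minimal standalone sketch of the heuristic, assuming `oneatom` mirrors `neighbor->oneatom`, the `neigh_modify one` setting (2000 by default in stock LAMMPS, so `mnf` defaults to 100):

```cpp
#include <cstdio>

// Sketch only: 'oneatom' stands in for neighbor->oneatom, the per-atom
// neighbor-list cap set by neigh_modify one (LAMMPS default: 2000).
int main() {
  const int oneatom = 2000;
  const int mnf = 5e-2 * oneatom;  // 5% heuristic; truncates to int -> 100
  std::printf("initial max neighbors per atom: %d (was hardcoded 300)\n", mnf);
  return 0;
}
```

The practical effect is that users who raise the `one` limit for dense systems now get a proportionally larger initial GPU-side allocation, tied to the same knob the CPU neighbor lists already honor.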
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include <cmath> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include "pair_lj_charmm_coul_charmm_gpu.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "domain.h" +#include "gpu_extra.h" + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double cut_lj_innersq, const double cut_coul_innersq, + const double denom_lj, const double denom_coul, + double **epsilon, double **sigma, + const bool mix_arithmetic); +void crm_gpu_clear(); +int ** crm_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); +void crm_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); +double crm_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmGPU::PairLJCharmmCoulCharmmGPU(LAMMPS *lmp) : + PairLJCharmmCoulCharmm(lmp), gpu_mode(GPU_FORCE) +{ + reinitflag = 0; + cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmGPU::~PairLJCharmmCoulCharmmGPU() +{ + crm_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + firstneigh = crm_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + crm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag,
eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + if (host_start < inum) { + cpu_time = MPI_Wtime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = MPI_Wtime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::init_style() +{ + if (!atom->q_flag) + error->all(FLERR, + "Pair style lj/charmm/coul/charmm/gpu requires atom attribute q"); + if (force->newton_pair) + error->all(FLERR, + "Cannot use newton pair with lj/charmm/coul/charmm/gpu pair style"); + + // Repeat cutsq calculation because done after call to init_style + + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) + cut = init_one(i,j); + } + } + + cut_lj_innersq = cut_lj_inner * cut_lj_inner; + cut_coul_innersq = cut_coul_inner * cut_coul_inner; + cut_ljsq = cut_lj * cut_lj; + cut_coulsq = cut_coul * cut_coul; + cut_bothsq = MAX(cut_ljsq,cut_coulsq); + + denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) * + (cut_ljsq-cut_lj_innersq); + denom_lj = 1.0 / denom_lj; + + denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) * + (cut_coulsq-cut_coul_innersq); + denom_coul = 1.0 / denom_coul; + + double cell_size = sqrt(cut_bothsq) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = crm_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + cut_lj_innersq,cut_coul_innersq,denom_lj, + denom_coul,epsilon,sigma,arithmetic); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) { + int irequest = neighbor->request(this,instance_me); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCharmmCoulCharmmGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + crm_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double philj,switch1,switch2; + int *jlist; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq =
delx*delx + dely*dely + delz*delz; + + if (rsq < cut_bothsq) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq) { + forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * denom_coul; + forcecoul *= switch1; + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq) { + r6inv = r2inv*r2inv*r2inv; + jtype = type[j]; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + switch2 = 12.0*rsq * (cut_ljsq-rsq) * + (rsq-cut_lj_innersq) * denom_lj; + philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); + forcelj = forcelj*switch1 + philj*switch2; + } + } else forcelj = 0.0; + + fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + ecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * + denom_coul; + ecoul *= switch1; + } + ecoul *= factor_coul; + } else ecoul = 0.0; + + if (rsq < cut_ljsq) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + evdwl *= switch1; + } + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.h b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h new file mode 100644 index 0000000000..d80730ca5c --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h @@ -0,0 +1,62 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/charmm/gpu,PairLJCharmmCoulCharmmGPU) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H +#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H + +#include "pair_lj_charmm_coul_charmm.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulCharmmGPU : public PairLJCharmmCoulCharmm { + public: + PairLJCharmmCoulCharmmGPU(LAMMPS *lmp); + ~PairLJCharmmCoulCharmmGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style lj/charmm/coul/charmm/gpu requires atom attribute q + +The atom style defined does not have this attribute. + +E: Cannot use newton pair with lj/charmm/coul/charmm/gpu pair style + +Self-explanatory.
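The switch1/switch2 expressions in cpu_compute() above implement the CHARMM switching function; denom_lj and denom_coul are precomputed in init_style() as 1/(cut² − cut_inner²)³. A self-contained sketch of the energy switching factor exactly as the code writes it (the 8 Å / 10 Å cutoffs in main() are illustrative values, not from the patch):

```cpp
#include <cstdio>

// CHARMM-style switching factor used in cpu_compute() above:
//   S(rsq) = (cut^2 - rsq)^2 (cut^2 + 2 rsq - 3 cut_inner^2) / (cut^2 - cut_inner^2)^3
// S falls smoothly from 1 at the inner cutoff to 0 at the outer cutoff.
static double charmm_switch(double rsq, double cut_innersq, double cutsq) {
  if (rsq <= cut_innersq) return 1.0;
  if (rsq >= cutsq) return 0.0;
  const double denom =
      (cutsq - cut_innersq) * (cutsq - cut_innersq) * (cutsq - cut_innersq);
  return (cutsq - rsq) * (cutsq - rsq) *
         (cutsq + 2.0 * rsq - 3.0 * cut_innersq) / denom;
}

int main() {
  // Illustrative cutoffs: inner 8 Angstrom, outer 10 Angstrom.
  for (double r = 7.5; r <= 10.5; r += 0.5)
    std::printf("r = %4.1f  S = %.4f\n", r, charmm_switch(r * r, 64.0, 100.0));
  return 0;
}
```

switch2 in the force branch carries the derivative of switch1 with respect to r², so the force stays consistent with the switched energy and remains continuous at both cutoffs.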
+ +*/ diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp index b89e4d4574..9753404d5e 100644 --- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp @@ -203,9 +203,10 @@ void PairLJCharmmCoulLongGPU::init_style() arithmetic = false; } + int mnf = 5e-2 * neighbor->oneatom; int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp index 50183196f8..3fc6195fa8 100644 --- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp @@ -188,9 +188,10 @@ void PairLJClass2CoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = c2cl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp index 55fdc2d43d..cf8158ce5f 100644 --- a/src/GPU/pair_lj_class2_gpu.cpp +++ b/src/GPU/pair_lj_class2_gpu.cpp @@ -157,9 +157,10 @@ void PairLJClass2GPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp index 35062a5d71..a0dd9498c6 100644 --- a/src/GPU/pair_lj_cubic_gpu.cpp +++ b/src/GPU/pair_lj_cubic_gpu.cpp @@ -52,18 +52,18 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, const double cell_size, int &gpu_mode, FILE *screen); void ljcb_gpu_clear(); -int ** ljcb_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ljcb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success); void ljcb_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, 
int &host_start, + const double cpu_time, bool &success); double ljcb_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -165,10 +165,11 @@ void PairLJCubicGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcb_gpu_init(atom->ntypes+1, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp index e4823a3ea4..7932a352b3 100644 --- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp @@ -48,16 +48,16 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int ** ljc_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljc_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); void ljc_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, @@ -168,9 +168,10 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp index 1f7ae9af01..eb8e2c9c7f 100644 --- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp @@ -41,17 +41,17 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double kappa); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa); void ljcd_gpu_clear(); 
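The ljcd_* prototypes around this hunk belong to lj/cut/coul/debye; the extra kappa argument threaded through ljcd_gpu_init() is the inverse Debye screening length. A sketch of the screened Coulomb energy that style evaluates, per the form documented for LAMMPS coul/debye (the charges, kappa, and the real-units conversion constant below are illustrative assumptions, not values from the patch):

```cpp
#include <cmath>
#include <cstdio>

// Debye-screened Coulomb energy: E(r) = C q_i q_j exp(-kappa r) / r,
// where C plays the role of force->qqrd2e and kappa = 1 / (Debye length).
int main() {
  const double qqrd2e = 332.06371;   // conversion constant in real units (assumption)
  const double qi = 1.0, qj = -1.0;  // hypothetical charges (e)
  const double kappa = 0.5;          // hypothetical inverse screening length (1/Angstrom)
  for (double r = 2.0; r <= 10.0; r += 2.0) {
    const double e_bare = qqrd2e * qi * qj / r;
    const double e_debye = e_bare * std::exp(-kappa * r);
    std::printf("r = %4.1f  bare = %9.4f  screened = %9.4f\n", r, e_bare, e_debye);
  }
  return 0;
}
```

The exponential factor is what lets the style use a short cutoff: beyond a few Debye lengths the screened interaction is negligible, so no long-range solver is needed.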
int ** ljcd_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -170,9 +170,10 @@ void PairLJCutCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, kappa); diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp index 6c25412ae8..e071245a56 100644 --- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp @@ -59,9 +59,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double e_shift, const double f_shift, const double alpha); void ljd_gpu_clear(); -int ** ljd_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljd_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -185,9 +185,10 @@ void PairLJCutCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp index 50776de795..cff48afd1e 100644 --- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp @@ -58,8 +58,8 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double *host_special_coul, const double qqrd2e, const double g_ewald); void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **host_lj_cutsq); void ljcl_gpu_clear(); int ** ljcl_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, @@ -193,9 +193,10 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git 
a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp index 33ba418533..d686ea4d88 100644 --- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp @@ -48,15 +48,17 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const int order, const double qqrd2e); + double *host_special_coul, const int order, + const double qqrd2e); void ljcm_gpu_clear(); -int ** ljcm_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljcm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd); + bool &success, double *host_q, double *boxlo, + double *prd); void ljcm_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -177,12 +179,13 @@ void PairLJCutCoulMSMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->kspace->get_gcons(), force->kspace->get_dgcons(), offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen, + mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->kspace->order, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp index ae93cd9010..16eef6e8e8 100644 --- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp @@ -173,9 +173,10 @@ void PairLJCutDipoleCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp index 8e7d5baddc..b7c29cedb8 100644 --- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp @@ -52,29 +52,30 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double 
**offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void dplj_gpu_clear(); int ** dplj_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double **host_mu, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double **host_mu, + double *boxlo, double *prd); void dplj_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, - const int nlocal, double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, double **host_mu, + const int nlocal, double *boxlo, double *prd); double dplj_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -196,9 +197,10 @@ void PairLJCutDipoleLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplj_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp index 2b2773b920..edd2a7feb0 100644 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_gpu.cpp @@ -47,13 +47,13 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset); void ljl_gpu_clear(); -int ** ljl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -164,9 +164,10 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 
5e-2 * neighbor->oneatom; int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp index 3e852513b2..9584c6f68a 100644 --- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp @@ -229,10 +229,11 @@ void PairLJCutTIP4PLongGPU::init_style() error->warning(FLERR,"Increasing communication cutoff for TIP4P GPU style"); } + int mnf = 5e-2 * neighbor->oneatom; int success = ljtip4p_long_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, typeH, typeO, alpha, qdist, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, cut_coulsqplus, force->special_coul, force->qqrd2e, diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp index 533f9d9070..da0c720c74 100644 --- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp @@ -50,31 +50,31 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); int ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double **host_lj_cutsq); void ljecl_gpu_clear(); int ** ljecl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void ljecl_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double 
*host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljecl_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -193,9 +193,10 @@ void PairLJExpandCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljecl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp index d3745dce56..0e86e41255 100644 --- a/src/GPU/pair_lj_expand_gpu.cpp +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -47,8 +47,8 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift); void lje_gpu_clear(); int ** lje_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void PairLJExpandGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp index 1bffbcd0b9..a605ebd6c4 100644 --- a/src/GPU/pair_lj_gromacs_gpu.cpp +++ b/src/GPU/pair_lj_gromacs_gpu.cpp @@ -43,16 +43,17 @@ using namespace LAMMPS_NS; int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, - double **cut_inner, double **cut_innersq); + double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, + double **host_ljsw5, double **cut_inner, + double **cut_innersq); void ljgrm_gpu_clear(); -int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, 
int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -164,9 +165,10 @@ void PairLJGromacsGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljgrm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, ljsw1, ljsw2, ljsw3, ljsw4, ljsw5, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp index a3ba87c82e..df2310e904 100644 --- a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp @@ -197,9 +197,10 @@ void PairLJSDKCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdkl_gpu_init(atom->ntypes+1, cutsq, lj_type, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_lj_sdk_gpu.cpp b/src/GPU/pair_lj_sdk_gpu.cpp index baf341c25a..5a1960e4c8 100644 --- a/src/GPU/pair_lj_sdk_gpu.cpp +++ b/src/GPU/pair_lj_sdk_gpu.cpp @@ -166,9 +166,10 @@ void PairLJSDKGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdk_gpu_init(atom->ntypes+1,cutsq,lj_type,lj1,lj2,lj3,lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp index 6f0ebc58b7..470c2f049e 100644 --- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp +++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp @@ -48,21 +48,21 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void dplsf_gpu_clear(); -int ** dplsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** dplsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd); -void dplsf_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, const int nlocal, - double *boxlo, double *prd); +void dplsf_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double 
*boxlo, + double *prd); double dplsf_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -172,9 +172,10 @@ void PairLJSFDipoleSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplsf_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp index e9e6eedde8..05e92909da 100644 --- a/src/GPU/pair_mie_cut_gpu.cpp +++ b/src/GPU/pair_mie_cut_gpu.cpp @@ -47,9 +47,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mie_gpu_clear(); -int ** mie_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mie_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -161,9 +161,10 @@ void PairMIECutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mie_gpu_init(atom->ntypes+1, cutsq, mie1, mie2, mie3, mie4, gamA, gamR, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp index 75ca5627ba..d929c76930 100644 --- a/src/GPU/pair_morse_gpu.cpp +++ b/src/GPU/pair_morse_gpu.cpp @@ -46,9 +46,9 @@ int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mor_gpu_clear(); -int ** mor_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mor_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -157,9 +157,10 @@ void PairMorseGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp index b6c212da6f..c816ad9166 100644 --- a/src/GPU/pair_resquared_gpu.cpp +++ b/src/GPU/pair_resquared_gpu.cpp @@ -44,16 +44,16 @@ using namespace LAMMPS_NS; int re_gpu_init(const int ntypes, double **shape, double **well, 
double **cutsq, double **sigma, double **epsilon, - int **form, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void re_gpu_clear(); int ** re_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -205,10 +205,11 @@ void PairRESquaredGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = re_gpu_init(atom->ntypes+1, shape1, well, cutsq, sigma, epsilon, form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp index c9eb55157a..5a3ad0c577 100644 --- a/src/GPU/pair_soft_gpu.cpp +++ b/src/GPU/pair_soft_gpu.cpp @@ -48,13 +48,13 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **prefactor, void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, double **host_cut); void soft_gpu_clear(); -int ** soft_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** soft_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void soft_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -162,9 +162,10 @@ void PairSoftGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = soft_gpu_init(atom->ntypes+1, cutsq, prefactor, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp index 3d851121e0..7bfbe2810f 100644 --- a/src/GPU/pair_sw_gpu.cpp +++ b/src/GPU/pair_sw_gpu.cpp @@ -38,31 +38,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int sw_gpu_init(const int ntypes, const int inum, const int 
nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq); +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3,double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); void sw_gpu_clear(); -int ** sw_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** sw_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void sw_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void sw_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success); double sw_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -159,55 +155,84 @@ void PairSWGPU::init_style() if (force->newton_pair != 0) error->all(FLERR,"Pair style sw/gpu requires newton pair off"); - double *epsilon, *sigma, *lambda, *gamma; - double *biga, *bigb, *powerp, *powerq; - double *_cut, *_cutsq, *costheta; - epsilon = sigma = lambda = gamma = nullptr; - biga = bigb = powerp = powerq = nullptr; - _cut = _cutsq = costheta = nullptr; + double **c1, **c2, **c3, **c4, **c5, **c6; + double **ncutsq, **ncut, **sigma, **powerp, **powerq, **sigma_gamma; + double ***lambda_epsilon, ***costheta; + c1 = c2 = c3 = c4 = c5 = c6 = nullptr; + ncutsq = ncut = sigma = powerp = powerq = sigma_gamma = nullptr; + lambda_epsilon = costheta = nullptr; - memory->create(epsilon,nparams,"pair:epsilon"); - memory->create(sigma,nparams,"pair:sigma"); - memory->create(lambda,nparams,"pair:lambda"); - memory->create(gamma,nparams,"pair:gamma"); - memory->create(biga,nparams,"pair:biga"); - memory->create(bigb,nparams,"pair:bigb"); - memory->create(powerp,nparams,"pair:powerp"); - memory->create(powerq,nparams,"pair:powerq"); - memory->create(_cut,nparams,"pair:_cut"); - memory->create(_cutsq,nparams,"pair:_cutsq"); - memory->create(costheta,nparams,"pair:costheta"); + const int tp1 = atom->ntypes + 1; - for (int i = 0; i < nparams; i++) { - epsilon[i] = params[i].epsilon; - sigma[i] = params[i].sigma; - lambda[i] = params[i].lambda; - gamma[i] = 
params[i].gamma; - biga[i] = params[i].biga; - bigb[i] = params[i].bigb; - powerp[i] = params[i].powerp; - powerq[i] = params[i].powerq; - _cut[i] = params[i].cut; - _cutsq[i] = params[i].cutsq; - costheta[i] = params[i].costheta; + memory->create(ncutsq, tp1, tp1, "pair:ncutsq"); + memory->create(ncut, tp1, tp1, "pair:ncut"); + memory->create(sigma, tp1, tp1, "pair:sigma"); + memory->create(powerp, tp1, tp1, "pair:powerp"); + memory->create(powerq, tp1, tp1, "pair:powerq"); + memory->create(sigma_gamma, tp1, tp1, "pair:sigma_gamma"); + memory->create(c1, tp1, tp1, "pair:c1"); + memory->create(c2, tp1, tp1, "pair:c2"); + memory->create(c3, tp1, tp1, "pair:c3"); + memory->create(c4, tp1, tp1, "pair:c4"); + memory->create(c5, tp1, tp1, "pair:c5"); + memory->create(c6, tp1, tp1, "pair:c6"); + memory->create(lambda_epsilon, tp1, tp1, tp1, "pair:lambda_epsilon"); + memory->create(costheta, tp1, tp1, tp1, "pair:costheta"); + + for (int ii = 1; ii < tp1; ii++) { + int i = map[ii]; + for (int jj = 1; jj < tp1; jj++) { + int j = map[jj]; + if (i < 0 || j < 0) + continue; + else { + int ijparam = elem2param[i][j][j]; + ncutsq[ii][jj] = params[ijparam].cutsq; + ncut[ii][jj] = params[ijparam].cut; + sigma[ii][jj] = params[ijparam].sigma; + powerp[ii][jj] = params[ijparam].powerp; + powerq[ii][jj] = params[ijparam].powerq; + sigma_gamma[ii][jj] = params[ijparam].sigma_gamma; + c1[ii][jj] = params[ijparam].c1; + c2[ii][jj] = params[ijparam].c2; + c3[ii][jj] = params[ijparam].c3; + c4[ii][jj] = params[ijparam].c4; + c5[ii][jj] = params[ijparam].c5; + c6[ii][jj] = params[ijparam].c6; + } + + for (int kk = 1; kk < tp1; kk++) { + int k = map[kk]; + if (k < 0) + continue; + else { + int ijkparam = elem2param[i][j][k]; + costheta[ii][jj][kk] = params[ijkparam].costheta; + lambda_epsilon[ii][jj][kk] = params[ijkparam].lambda_epsilon; + } + } + } } - int success = sw_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen, map, nelements, - elem2param, nparams, epsilon, - sigma, lambda, gamma, costheta, biga, bigb, - powerp, powerq, _cut, _cutsq); + int mnf = 5e-2 * neighbor->oneatom; + int success = sw_gpu_init(tp1, atom->nlocal, atom->nlocal+atom->nghost, mnf, + cell_size, gpu_mode, screen, ncutsq, ncut, sigma, + powerp, powerq, sigma_gamma, c1, c2, c3, c4, c5, + c6, lambda_epsilon, costheta, map, elem2param); - memory->destroy(epsilon); + memory->destroy(ncutsq); + memory->destroy(ncut); memory->destroy(sigma); - memory->destroy(lambda); - memory->destroy(gamma); - memory->destroy(biga); - memory->destroy(bigb); memory->destroy(powerp); memory->destroy(powerq); - memory->destroy(_cut); - memory->destroy(_cutsq); + memory->destroy(sigma_gamma); + memory->destroy(c1); + memory->destroy(c2); + memory->destroy(c3); + memory->destroy(c4); + memory->destroy(c5); + memory->destroy(c6); + memory->destroy(lambda_epsilon); memory->destroy(costheta); GPU_EXTRA::check_flag(success,error,world); @@ -218,7 +243,6 @@ void PairSWGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp index e3cb740e0e..05b76d9adb 100644 --- a/src/GPU/pair_table_gpu.cpp +++ b/src/GPU/pair_table_gpu.cpp @@ -231,9 +231,10 @@ void PairTableGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int 
success = table_gpu_init(atom->ntypes+1, cutsq, table_coeffs, table_data, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, tabstyle, ntables, tablength); GPU_EXTRA::check_flag(success,error,world); @@ -243,7 +244,6 @@ void PairTableGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - memory->destroy(table_coeffs); memory->destroy(table_data); } diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp index 8758150956..e675ba6903 100644 --- a/src/GPU/pair_tersoff_gpu.cpp +++ b/src/GPU/pair_tersoff_gpu.cpp @@ -66,8 +66,6 @@ void tersoff_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -216,8 +214,9 @@ void PairTersoffGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -252,7 +251,6 @@ void PairTersoffGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp index 71734c1c09..98a7248c1f 100644 --- a/src/GPU/pair_tersoff_mod_gpu.cpp +++ b/src/GPU/pair_tersoff_mod_gpu.cpp @@ -43,9 +43,10 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* ts_lam1, const double* ts_lam2, const double* ts_lam3, const double* ts_powermint, const double* ts_biga, const double* ts_bigb, - const double* ts_bigr, const double* ts_bigd, const double* ts_c1, const double* ts_c2, - const double* ts_c3, const double* ts_c4, const double* ts_c5, const double* ts_h, - const double* ts_beta, const double* ts_powern, const double* ts_powern_del, + const double* ts_bigr, const double* ts_bigd, const double* ts_c1, + const double* ts_c2, const double* ts_c3, const double* ts_c4, + const double* ts_c5, const double* ts_h, const double* ts_beta, + const double* ts_powern, const double* ts_powern_del, const double* ts_ca1, const double* ts_cutsq); void tersoff_mod_gpu_clear(); int ** tersoff_mod_gpu_compute_n(const int ago, const int inum_full, @@ -61,8 +62,6 @@ void tersoff_mod_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_mod_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -208,8 +207,9 @@ void PairTersoffMODGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_mod_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + 
atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -244,7 +244,6 @@ void PairTersoffMODGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp index e662159fa8..e17b48fec5 100644 --- a/src/GPU/pair_tersoff_zbl_gpu.cpp +++ b/src/GPU/pair_tersoff_zbl_gpu.cpp @@ -69,8 +69,6 @@ void tersoff_zbl_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_zbl_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -225,8 +223,9 @@ void PairTersoffZBLGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_zbl_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -266,7 +265,6 @@ void PairTersoffZBLGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp index 87354acda9..f950bf11c3 100644 --- a/src/GPU/pair_ufm_gpu.cpp +++ b/src/GPU/pair_ufm_gpu.cpp @@ -43,28 +43,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **host_uf2, double **host_uf3, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); int ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset); + double **host_uf2, double **host_uf3, double **offset); void ufml_gpu_clear(); -int ** ufml_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ufml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void ufml_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const 
bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ufml_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +165,10 @@ void PairUFMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ufml_gpu_init(atom->ntypes+1, cutsq, uf1, uf2, uf3, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp index df17b2091a..c5dd722974 100644 --- a/src/GPU/pair_vashishta_gpu.cpp +++ b/src/GPU/pair_vashishta_gpu.cpp @@ -38,34 +38,34 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, - const double* gamma, const double* eta, - const double* lam1inv, const double* lam4inv, - const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, - const double* heta, const double* bigh, - const double* bigw, const double* c0, - const double* costheta, const double* bigb, - const double* big2b, const double* bigc); +int vashishta_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen, int* host_map, + const int nelements, int*** host_elem2param, + const int nparams, const double* cutsq, const double* r0, + const double* gamma, const double* eta, + const double* lam1inv, const double* lam4inv, + const double* zizj, const double* mbigd, + const double* dvrc, const double* big6w, + const double* heta, const double* bigh, + const double* bigw, const double* c0, + const double* costheta, const double* bigb, + const double* big2b, const double* bigc); void vashishta_gpu_clear(); -int ** vashishta_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** vashishta_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void vashishta_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void vashishta_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double 
vashishta_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -214,7 +214,8 @@ void PairVashishtaGPU::init_style() big2b[i] = params[i].big2b; bigc[i] = params[i].bigc; } - int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 500, + int mnf = 5e-2 * neighbor->oneatom; + int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, cutsq, r0, gamma, eta, lam1inv, lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, @@ -246,7 +247,6 @@ void PairVashishtaGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp index 8da3b48dd5..9322f95f44 100644 --- a/src/GPU/pair_yukawa_colloid_gpu.cpp +++ b/src/GPU/pair_yukawa_colloid_gpu.cpp @@ -41,24 +41,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - const double kappa); + double **host_offset, double *special_lj, const int inum, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, const double kappa); void ykcolloid_gpu_clear(); int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_rad); void ykcolloid_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, + double *host_rad); double ykcolloid_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +170,10 @@ void PairYukawaColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ykcolloid_gpu_init(atom->ntypes+1, cutsq, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, kappa); 
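+ // note: mnf estimates the maximum number of neighbors per atom for + // the GPU neighbor list as 5 percent of neighbor->oneatom (the + // neigh_modify one setting); it replaces the fixed value of 300 + // passed here and in the other GPU pair styles above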
GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp index 8c133b068e..81304159a0 100644 --- a/src/GPU/pair_yukawa_gpu.cpp +++ b/src/GPU/pair_yukawa_gpu.cpp @@ -49,10 +49,10 @@ void yukawa_gpu_clear(); int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success); + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -159,9 +159,10 @@ void PairYukawaGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = yukawa_gpu_init(atom->ntypes+1, cutsq, kappa, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp index eda0c26614..93e0588285 100644 --- a/src/GPU/pair_zbl_gpu.cpp +++ b/src/GPU/pair_zbl_gpu.cpp @@ -50,9 +50,9 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void zbl_gpu_clear(); -int ** zbl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** zbl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -165,11 +165,12 @@ void PairZBLGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = zbl_gpu_init(atom->ntypes+1, cutsq, sw1, sw2, sw3, sw4, sw5, d1a, d2a, d3a, d4a, zze, cut_globalsq, cut_innersq, cut_inner, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen); + mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index cc7ef8841e..61d0144b73 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -80,9 +80,9 @@ FFT_SCALAR* PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen, const bool respa, int &success); void PPPM_GPU_API(clear)(const double poisson_time); int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, const double delxinv, - const double delyinv, const double delzinv); + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale); double PPPM_GPU_API(bytes)(); void 
PPPM_GPU_API(forces)(double **f); @@ -208,9 +208,9 @@ void PPPMGPU::compute(int eflag, int vflag) if (triclinic == 0) { bool success = true; int flag=PPPM_GPU_API(spread)(nago, atom->nlocal, atom->nlocal + - atom->nghost, atom->x, atom->type, success, - atom->q, domain->boxlo, delxinv, delyinv, - delzinv); + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); if (flag != 0) @@ -402,7 +402,7 @@ void PPPMGPU::poisson_ik() work1[n++] = ZEROF; } - fft1->compute(work1,work1,1); + fft1->compute(work1,work1,FFT3d::FORWARD); // if requested, compute energy and virial contribution @@ -441,7 +441,7 @@ void PPPMGPU::poisson_ik() if (evflag_atom) poisson_peratom(); - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // compute gradients of V(r) in each of 3 dims by transforming ik*V(k) // FFT leaves data in 3d brick decomposition // copy it into inner portion of vdx,vdy,vdz arrays @@ -451,12 +451,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; + work2[n] = -fkx[i]*work1[n+1]; + work2[n+1] = fkx[i]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; int x_hi = nxhi_in * 4 + 3; @@ -473,12 +473,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fky[j]*work1[n+1]; - work2[n+1] = -fky[j]*work1[n]; + work2[n] = -fky[j]*work1[n+1]; + work2[n+1] = fky[j]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) @@ -494,12 +494,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkz[k]*work1[n+1]; - work2[n+1] = -fkz[k]*work1[n]; + work2[n] = -fkz[k]*work1[n+1]; + work2[n+1] = fkz[k]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) diff --git a/src/MAKE/OPTIONS/Makefile.g++_openmpi b/src/MAKE/OPTIONS/Makefile.g++_openmpi index 548994f832..75c12f9b38 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_openmpi +++ b/src/MAKE/OPTIONS/Makefile.g++_openmpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler export OMPI_CXX = g++ -CC = mpicxx +CC = mpicxx -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = mpicxx +LINK = mpicxx -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.g++_serial b/src/MAKE/OPTIONS/Makefile.g++_serial index 65de6a2c2c..4f6f0afe22 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_serial +++ b/src/MAKE/OPTIONS/Makefile.g++_serial @@ -6,12 +6,12 @@ SHELL = /bin/sh # compiler/linker settings # specify flags and libraries needed for your compiler -CC = g++ +CC = g++ -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = g++ +LINK = g++ -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.oneapi b/src/MAKE/OPTIONS/Makefile.oneapi new file mode 100644 index 0000000000..2524773a76 --- /dev/null +++ b/src/MAKE/OPTIONS/Makefile.oneapi @@ -0,0 +1,122 @@ +# oneapi = For Intel oneAPI builds with GPU package + +SHELL = /bin/sh + +# 
--------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -std=c++11 +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +CCFLAGS = -qopenmp -qopenmp-simd -qno-offload -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \ + -I$(MKLROOT)/include +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -std=c++11 +LINKFLAGS = -qopenmp -qopenmp-simd $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/ +LIB = -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings, all OPTIONAL +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings +# see possible settings in Section 3.5 of the manual + +LMP_INC = -DLAMMPS_GZIP + +# MPI library +# see discussion in Section 3.4 of the manual +# MPI wrapper compiler/linker can provide this info +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +MPI_PATH = +MPI_LIB = + +# FFT library +# see discussion in Section 3.5.2 of manual +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE +FFT_PATH = +FFT_LIB = + +# JPEG and/or PNG library +# see discussion in Section 3.5.4 of manual +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = + +# --------------------------------------------------------------------- +# build rules and dependencies +# do not edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) +EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) +EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ + # Link target + + $(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS) + $(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@ + $(SIZE) $@ + + # Library targets + + $(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + @rm -f ../$(ARLIB) + $(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ) + @rm -f $(ARLIB) + @ln -s ../$(ARLIB) $(ARLIB) + + $(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + @rm -f $(SHLIB) + @ln -s ../$(SHLIB) $(SHLIB) + + # Compilation rules + + %.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + + # Individual dependencies + + depend : fastdep.exe $(SRC) + @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 + + fastdep.exe: ../DEPEND/fastdep.c + cc -O -o $@ $< + + sinclude .depend diff --git a/src/STUBS/Makefile b/src/STUBS/Makefile index 3c3c3b46d9..c9b6fdb65a 100644 --- a/src/STUBS/Makefile +++ b/src/STUBS/Makefile @@ -11,13 +11,13 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_stubs.a -OBJ = $(SRC:.c=.o) +OBJ = $(SRC:.cpp=.o) # System-specific settings @@ -36,7 +36,7 @@ clean: # Compilation rules -.c.o: +.cpp.o: $(CC) $(CCFLAGS) -c $< # Individual dependencies diff --git a/src/STUBS/Makefile.mingw32-cross b/src/STUBS/Makefile.mingw32-cross index 4144954ec7..2934bbd468 100644 --- a/src/STUBS/Makefile.mingw32-cross +++ b/src/STUBS/Makefile.mingw32-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw32.a -OBJ = $(SRC:%.c=%_mingw32.o) +OBJ = $(SRC:%.cpp=%_mingw32.o) # System-specific settings -CC = i686-w64-mingw32-gcc +CC = i686-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=i686 -mtune=generic -mfpmath=387 -mpc64 -I. ARCHIVE = i686-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/Makefile.mingw64-cross b/src/STUBS/Makefile.mingw64-cross index 70b971f262..e62d5dcbe1 100644 --- a/src/STUBS/Makefile.mingw64-cross +++ b/src/STUBS/Makefile.mingw64-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw64.a -OBJ = $(SRC:%.c=%_mingw64.o) +OBJ = $(SRC:%.cpp=%_mingw64.o) # System-specific settings -CC = x86_64-w64-mingw32-gcc +CC = x86_64-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=core2 -mtune=core2 -msse2 -mpc64 -I. ARCHIVE = x86_64-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.cpp similarity index 100% rename from src/STUBS/mpi.c rename to src/STUBS/mpi.cpp diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h index 063dc542be..28e897960d 100644 --- a/src/STUBS/mpi.h +++ b/src/STUBS/mpi.h @@ -16,12 +16,17 @@ #include -/* use C bindings for MPI interface */ +/* We compile STUBS with C++ so the symbols embedded in + * the serial shared library will not collide with any + * corresponding symbols from a real MPI library (which + * uses C bindings). As a consequence the header *must* + * enforce compiling with C++ only. 
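+ * For example, with C bindings both the stubs and a real MPI + * library export the identical symbol name MPI_Init; compiled as + * C++ the stub symbols are name-mangled and therefore distinct.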
*/ -#ifdef __cplusplus -extern "C" { +#ifndef __cplusplus +#error "MPI STUBS must be compiled with a C++ compiler" #endif + /* Dummy defs for MPI stubs */ #define MPI_COMM_WORLD 0 @@ -176,8 +181,4 @@ int MPI_Alltoallv(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype recvtype, MPI_Comm comm); /* ---------------------------------------------------------------------- */ -#ifdef __cplusplus -} -#endif - #endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 31bd63160f..6c7e108ca6 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -318,8 +318,7 @@ void FixIntel::init() _zero_master = 0; if (_pair_hybrid_flag && _hybrid_nonpair) - if (_pair_hybrid_flag > 1 || force->newton_pair == 0) - _pair_hybrid_zero = 1; + _pair_hybrid_zero = 1; _hybrid_nonpair = 0; _pair_intel_count = 0; diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 3098a1bd67..93c9fe525b 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -537,7 +537,6 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) : nmax = 0; partner = finalpartner = nullptr; distsq = nullptr; - probability = nullptr; maxattempt = 0; attempt = nullptr; nattempt = nullptr; @@ -585,7 +584,6 @@ FixBondReact::~FixBondReact() memory->destroy(finalpartner); memory->destroy(nattempt); memory->destroy(distsq); - memory->destroy(probability); memory->destroy(attempt); memory->destroy(edge); memory->destroy(equivalences); @@ -870,6 +868,9 @@ void FixBondReact::post_integrate() ghostly_rxn_count[i] = 0; nlocalskips[i] = 0; nghostlyskips[i] = 0; + // update reaction probability + if (var_flag[PROB][i]) + fraction[i] = input->variable->compute_equal(var_id[PROB][i]); } if (nevery_check) { @@ -890,16 +891,14 @@ void FixBondReact::post_integrate() memory->destroy(finalpartner); memory->destroy(distsq); memory->destroy(nattempt); - memory->destroy(probability); nmax = atom->nmax; memory->create(partner,nmax,"bond/react:partner"); memory->create(finalpartner,nmax,"bond/react:finalpartner"); memory->create(distsq,nmax,2,"bond/react:distsq"); memory->create(nattempt,nreacts,"bond/react:nattempt"); - memory->create(probability,nmax,"bond/react:probability"); } - // reset create counts + // reset 'attempt' counts for (int i = 0; i < nreacts; i++) { nattempt[i] = 0; } @@ -962,25 +961,14 @@ void FixBondReact::post_integrate() comm->reverse_comm_fix(this); } - // update reaction probability - if (var_flag[PROB][rxnID]) - fraction[rxnID] = input->variable->compute_equal(var_id[PROB][rxnID]); - // each atom now knows its winning partner - // for prob check, generate random value for each atom with a bond partner - // forward comm of partner and random value, so ghosts have it - - if (fraction[rxnID] < 1.0) { - for (int i = 0; i < nlocal; i++) - if (partner[i]) probability[i] = random[rxnID]->uniform(); - } + // forward comm of partner, so ghosts have it commflag = 2; comm->forward_comm_fix(this,2); // consider for reaction: // only if both atoms list each other as winning bond partner - // and probability constraint is satisfied // if other atom is owned by another proc, it should do same thing int temp_nattempt = 0; @@ -994,16 +982,6 @@ void FixBondReact::post_integrate() continue; } - // apply probability constraint using RN for atom with smallest ID - - if (fraction[rxnID] < 1.0) { - if (tag[i] < tag[j]) { - if (probability[i] >= fraction[rxnID]) continue; - } else { - if (probability[j] >= fraction[rxnID]) continue; 
- } - } - // store final bond partners and count the rxn possibility once finalpartner[i] = tag[j]; @@ -1031,23 +1009,28 @@ void FixBondReact::post_integrate() if (finalpartner[i] == 0) continue; j = atom->map(finalpartner[i]); - // if (j < 0 || tag[i] < tag[j]) { - if (tag[i] < tag[j]) { //atom->map(std::min(tag[i],tag[j])) <= nlocal && - if (nattempt[rxnID] == maxattempt) { + if (tag[i] < tag[j]) { + if (nattempt[rxnID] > maxattempt-2) { maxattempt += DELTA; - // third column of 'attempt': bond/react integer ID + // third dim of 'attempt': bond/react integer ID memory->grow(attempt,maxattempt,2,nreacts,"bond/react:attempt"); } // to ensure types remain in same order - // unnecessary now taken from reaction map file if (iatomtype[rxnID] == type[i]) { attempt[nattempt[rxnID]][0][rxnID] = tag[i]; attempt[nattempt[rxnID]][1][rxnID] = finalpartner[i]; + nattempt[rxnID]++; + // add another attempt if initiator atoms are same type + if (iatomtype[rxnID] == jatomtype[rxnID]) { + attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; + attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; + } } else { attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; } - nattempt[rxnID]++; } } } @@ -1340,10 +1323,14 @@ void FixBondReact::superimpose_algorithm() (nxspecial[local_atom1][0] == 0 || xspecial[local_atom1][0] == atom->tag[local_atom2]) && check_constraints()) { - status = ACCEPT; - glove_ghostcheck(); - } else - status = REJECT; + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) { + status = REJECT; + } else { + status = ACCEPT; + glove_ghostcheck(); + } + } else status = REJECT; } avail_guesses = 0; @@ -1380,9 +1367,12 @@ void FixBondReact::superimpose_algorithm() } } - if (status == ACCEPT && check_constraints()) { // reaction site found successfully! - glove_ghostcheck(); - } + // reaction site found successfully! 
+ if (status == ACCEPT) + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) status = REJECT; + else glove_ghostcheck(); + hang_catch++; // let's go ahead and catch the simplest of hangs //if (hang_catch > onemol->natoms*4) @@ -1622,8 +1612,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; } // status should still == PROCEED @@ -1674,8 +1664,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; // will never complete here when there are edge atoms // ...actually that could be wrong if people get creative...shouldn't affect anything @@ -1786,8 +1776,8 @@ void FixBondReact::inner_crosscheck_loop() } glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; } status = CONTINUE; @@ -1798,21 +1788,17 @@ void FixBondReact::inner_crosscheck_loop() Necessary for certain ringed structures ------------------------------------------------------------------------- */ -void FixBondReact::ring_check() +int FixBondReact::ring_check() { // ring_check can be made more efficient by re-introducing 'frozen' atoms // 'frozen' atoms have been assigned and also are no longer pioneers // double check the number of neighbors match for all non-edge atoms // otherwise, atoms at 'end' of symmetric ring can behave like edge atoms - for (int i = 0; i < onemol->natoms; i++) { - if (edge[i][rxnID] == 0) { - if (onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) { - status = GUESSFAIL; - return; - } - } - } + for (int i = 0; i < onemol->natoms; i++) + if (edge[i][rxnID] == 0 && + onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) + return 0; for (int i = 0; i < onemol->natoms; i++) { for (int j = 0; j < onemol_nxspecial[i][0]; j++) { @@ -1824,12 +1810,10 @@ void FixBondReact::ring_check() break; } } - if (ring_fail == 1) { - status = GUESSFAIL; - return; - } + if (ring_fail == 1) return 0; } } + return 1; } /* ---------------------------------------------------------------------- @@ -2705,7 +2689,7 @@ update molecule IDs, charges, types, special lists and all topology void FixBondReact::update_everything() { - int nlocal; // must be defined after create_atoms + int nlocal = atom->nlocal; // must be redefined after create atoms int *type = atom->type; int **nspecial = atom->nspecial; tagint **special = atom->special; @@ -2717,6 +2701,9 @@ void FixBondReact::update_everything() // used when deleting atoms int ndel,ndelone; int *mark; + int nmark = nlocal; + memory->create(mark,nmark,"bond/react:mark"); + for (int i = 0; i < nmark; i++) mark[i] = 0; tagint *tag = atom->tag; AtomVec *avec = atom->avec; @@ -2778,8 +2765,11 @@ void FixBondReact::update_everything() // mark to-delete atoms nlocal = atom->nlocal; - mark = new int[nlocal]; - for (int i = 0; i < nlocal; i++) mark[i] = 0; + if (nlocal > nmark) { + memory->grow(mark,nlocal,"bond/react:mark"); + for (int i = nmark; i < nlocal; i++) mark[i] = 0; + nmark = nlocal; + } for (int i = 0; i < update_num_mega; i++) { rxnID = update_mega_glove[0][i]; onemol = atom->molecules[unreacted_mol[rxnID]]; @@ -3228,7 +3218,7 @@ void 
FixBondReact::update_everything() } } } - delete [] mark; + memory->destroy(mark); MPI_Allreduce(&ndelone,&ndel,1,MPI_INT,MPI_SUM,world); @@ -3941,20 +3931,10 @@ int FixBondReact::pack_forward_comm(int n, int *list, double *buf, m = 0; - if (commflag == 1) { - for (i = 0; i < n; i++) { - j = list[i]; - printf("hello you shouldn't be here\n"); - //buf[m++] = ubuf(bondcount[j]).d; - } - return m; - } - if (commflag == 2) { for (i = 0; i < n; i++) { j = list[i]; buf[m++] = ubuf(partner[j]).d; - buf[m++] = probability[j]; } return m; } @@ -3980,15 +3960,9 @@ void FixBondReact::unpack_forward_comm(int n, int first, double *buf) m = 0; last = first + n; - if (commflag == 1) { + if (commflag == 2) { for (i = first; i < last; i++) - printf("hello you shouldn't be here\n"); - // bondcount[i] = (int) ubuf(buf[m++]).i; - } else if (commflag == 2) { - for (i = first; i < last; i++) { partner[i] = (tagint) ubuf(buf[m++]).i; - probability[i] = buf[m++]; - } } else { m = 0; last = first + n; @@ -4029,20 +4003,18 @@ void FixBondReact::unpack_reverse_comm(int n, int *list, double *buf) m = 0; - if (commflag != 1) { - for (i = 0; i < n; i++) { - j = list[i]; - if (closeneigh[rxnID] != 0) { - if (buf[m+1] < distsq[j][1]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][1] = buf[m++]; - } else m += 2; - } else { - if (buf[m+1] > distsq[j][0]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][0] = buf[m++]; - } else m += 2; - } + for (i = 0; i < n; i++) { + j = list[i]; + if (closeneigh[rxnID] != 0) { + if (buf[m+1] < distsq[j][1]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][1] = buf[m++]; + } else m += 2; + } else { + if (buf[m+1] > distsq[j][0]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][0] = buf[m++]; + } else m += 2; } } } diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h index 87a5945d45..67788df217 100644 --- a/src/USER-REACTION/fix_bond_react.h +++ b/src/USER-REACTION/fix_bond_react.h @@ -86,7 +86,7 @@ class FixBondReact : public Fix { int nmax; // max num local atoms int max_natoms; // max natoms in a molecule template tagint *partner,*finalpartner; - double **distsq,*probability; + double **distsq; int *nattempt; int maxattempt; int allnattempt; @@ -171,7 +171,7 @@ class FixBondReact : public Fix { void check_a_neighbor(); void crosscheck_the_neighbor(); void inner_crosscheck_loop(); - void ring_check(); + int ring_check(); int check_constraints(); void get_IDcoords(int, int, double *); double get_temperature(tagint **, int, int); diff --git a/src/atom.cpp b/src/atom.cpp index 3308d07267..fe260309e2 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -40,6 +40,10 @@ #include "neigh_request.h" #endif +#ifdef LMP_GPU +#include "fix_gpu.h" +#endif + using namespace LAMMPS_NS; using namespace MathConst; @@ -1748,7 +1752,7 @@ void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg) if (lo < 1 || hi > ntypes) error->all(file,line,"Invalid type for mass set"); for (int itype = lo; itype <= hi; itype++) { - mass[itype] = atof(arg[1]); + mass[itype] = utils::numeric(FLERR,arg[1],false,lmp); mass_setflag[itype] = 1; if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value"); @@ -2149,7 +2153,7 @@ void Atom::setup_sort_bins() bininvy = nbiny / (bboxhi[1]-bboxlo[1]); bininvz = nbinz / (bboxhi[2]-bboxlo[2]); - #ifdef LMP_USER_INTEL +#ifdef LMP_USER_INTEL int intel_neigh = 0; if (neighbor->nrequest) { if (neighbor->requests[0]->intel) intel_neigh = 1; @@ -2194,7 +2198,36 @@ void Atom::setup_sort_bins() bboxhi[1] = 
bboxlo[1] + static_cast<double>(nbiny) / bininvy; bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz; } - #endif +#endif + +#ifdef LMP_GPU + if (userbinsize == 0.0) { + int ifix = modify->find_fix("package_gpu"); + if (ifix >= 0) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + + FixGPU *fix = static_cast<FixGPU *>(modify->fix[ifix]); + binsize = fix->binsize(subx, suby, subz, atom->nlocal, + neighbor->cutneighmax); + bininv = 1.0 / binsize; + + nbinx = static_cast<int> (ceil(subx * bininv)); + nbiny = static_cast<int> (ceil(suby * bininv)); + nbinz = static_cast<int> (ceil(subz * bininv)); + if (domain->dimension == 2) nbinz = 1; + + if (nbinx == 0) nbinx = 1; + if (nbiny == 0) nbiny = 1; + if (nbinz == 0) nbinz = 1; + + bininvx = bininv; + bininvy = bininv; + bininvz = bininv; + } + } +#endif if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); diff --git a/src/citeme.cpp b/src/citeme.cpp index fdd1ee867d..41ac87f5bb 100644 --- a/src/citeme.cpp +++ b/src/citeme.cpp @@ -118,7 +118,7 @@ void CiteMe::flush() if (!citefile.empty()) logbuffer += fmt::format(cite_file,"file",citefile); if (screen_flag == VERBOSE) - scrbuffer += fmt::format(cite_file,"screen","output"); + logbuffer += fmt::format(cite_file,"screen","output"); logbuffer += cite_separator; if (logfile) fputs(logbuffer.c_str(),logfile); logbuffer.clear(); diff --git a/src/compute_reduce.cpp b/src/compute_reduce.cpp index 82d3dff458..bc9aeefe7b 100644 --- a/src/compute_reduce.cpp +++ b/src/compute_reduce.cpp @@ -148,8 +148,8 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) : if (iarg+3 > narg) error->all(FLERR,"Illegal compute reduce command"); if (mode != MINN && mode != MAXX) error->all(FLERR,"Compute reduce replace requires min or max mode"); - int col1 = atoi(arg[iarg+1]) - 1; - int col2 = atoi(arg[iarg+2]) - 1; + int col1 = utils::inumeric(FLERR,arg[iarg+1],false,lmp) - 1; + int col2 = utils::inumeric(FLERR,arg[iarg+2],false,lmp) - 1; if (col1 < 0 || col1 >= nvalues || col2 < 0 || col2 >= nvalues) error->all(FLERR,"Illegal compute reduce command"); if (col1 == col2) error->all(FLERR,"Illegal compute reduce command"); diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp index ed8df72096..b4e6af90cf 100644 --- a/src/dump_cfg.cpp +++ b/src/dump_cfg.cpp @@ -75,7 +75,8 @@ DumpCFG::DumpCFG(LAMMPS *lmp, int narg, char **arg) : if (argi.get_dim() == 1) { std::string newarg(std::to_string(earg[iarg][0])); - newarg += '_' + argi.get_name() + '_' + std::to_string(argi.get_index1()); + newarg += std::string("_") + argi.get_name(); + newarg += std::string("_") + std::to_string(argi.get_index1()); auxname[i] = new char[newarg.size()+1]; strcpy(auxname[i],newarg.c_str()); } else { diff --git a/src/fix_addforce.cpp b/src/fix_addforce.cpp index a06544e268..07031a40a4 100644 --- a/src/fix_addforce.cpp +++ b/src/fix_addforce.cpp @@ -83,7 +83,7 @@ FixAddForce::FixAddForce(LAMMPS *lmp, int narg, char **arg) : while (iarg < narg) { if (strcmp(arg[iarg],"every") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal fix addforce command"); - nevery = atoi(arg[iarg+1]); + nevery = utils::inumeric(FLERR,arg[iarg+1],false,lmp); if (nevery <= 0) error->all(FLERR,"Illegal fix addforce command"); iarg += 2; } else if (strcmp(arg[iarg],"region") == 0) { diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp index c1c52a3f8c..f18888bbfc 100644 --- a/src/fix_property_atom.cpp +++ 
b/src/fix_property_atom.cpp @@ -254,13 +254,19 @@ void FixPropertyAtom::read_data_section(char *keyword, int n, char *buf, if ((m = atom->map(itag)) >= 0) { for (j = 0; j < nvalue; j++) { - if (style[j] == MOLECULE) atom->molecule[m] = ATOTAGINT(values[j+1]); - else if (style[j] == CHARGE) atom->q[m] = atof(values[j+1]); - else if (style[j] == RMASS) atom->rmass[m] = atof(values[j+1]); - else if (style[j] == INTEGER) - atom->ivector[index[j]][m] = atoi(values[j+1]); - else if (style[j] == DOUBLE) - atom->dvector[index[j]][m] = atof(values[j+1]); + if (style[j] == MOLECULE) { + atom->molecule[m] = utils::tnumeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == CHARGE) { + atom->q[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == RMASS) { + atom->rmass[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == INTEGER) { + atom->ivector[index[j]][m] = utils::inumeric(FLERR,values[j+1], + false,lmp); + } else if (style[j] == DOUBLE) { + atom->dvector[index[j]][m] = utils::numeric(FLERR,values[j+1], + true,lmp); + } } } diff --git a/src/image.cpp b/src/image.cpp index 4b181ee8b0..0acef0bceb 100644 --- a/src/image.cpp +++ b/src/image.cpp @@ -113,6 +113,11 @@ Image::Image(LAMMPS *lmp, int nmap_caller) : Pointers(lmp) backLightColor[2] = 0.9; random = nullptr; + + // MPI_Gatherv vectors + + recvcounts = nullptr; + displs = nullptr; } /* ---------------------------------------------------------------------- */ @@ -134,6 +139,9 @@ Image::~Image() memory->destroy(rgbcopy); if (random) delete random; + + memory->destroy(recvcounts); + memory->destroy(displs); } /* ---------------------------------------------------------------------- @@ -334,16 +342,37 @@ void Image::merge() // extra SSAO enhancement // bcast full image to all procs // each works on subset of pixels - // gather result back to proc 0 + // MPI_Gather() result back to proc 0 + // use Gatherv() if subset of pixels is not the same size on every proc if (ssao) { MPI_Bcast(imageBuffer,npixels*3,MPI_BYTE,0,world); MPI_Bcast(surfaceBuffer,npixels*2,MPI_DOUBLE,0,world); MPI_Bcast(depthBuffer,npixels,MPI_DOUBLE,0,world); compute_SSAO(); - int pixelPart = height/nprocs * width*3; - MPI_Gather(imageBuffer+me*pixelPart,pixelPart,MPI_BYTE, - rgbcopy,pixelPart,MPI_BYTE,0,world); + + int pixelstart = 3 * static_cast<int> (1.0*me/nprocs * npixels); + int pixelstop = 3 * static_cast<int> (1.0*(me+1)/nprocs * npixels); + int mypixels = pixelstop - pixelstart; + + if (npixels % nprocs == 0) { + MPI_Gather(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,mypixels,MPI_BYTE,0,world); + + } else { + if (recvcounts == nullptr) { + memory->create(recvcounts,nprocs,"image:recvcounts"); + memory->create(displs,nprocs,"image:displs"); + MPI_Allgather(&mypixels,1,MPI_INT,recvcounts,1,MPI_INT,world); + displs[0] = 0; + for (int i = 1; i < nprocs; i++) + displs[i] = displs[i-1] + recvcounts[i-1]; + } + + MPI_Gatherv(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,recvcounts,displs,MPI_BYTE,0,world); + } + writeBuffer = rgbcopy; } else { writeBuffer = imageBuffer; @@ -880,110 +909,117 @@ void Image::compute_SSAO() -tanPerPixel / zoom; int pixelRadius = (int) trunc (SSAORadius / pixelWidth + 0.5); - int x,y,s; - int hPart = height / nprocs; - int index = me * hPart * width; - for (y = me * hPart; y < (me + 1) * hPart; y ++) { - for (x = 0; x < width; x ++, index ++) { - double cdepth = depthBuffer[index]; - if (cdepth < 0) { continue; } + // each proc is assigned a subset of contiguous pixels from the full image + // pixels 
are contiguous in x (columns within a row), then by row + // index = pixels from 0 to npixels-1 + // x = column # from 0 to width-1 + // y = row # from 0 to height-1 - double sx = surfaceBuffer[index * 2 + 0]; - double sy = surfaceBuffer[index * 2 + 1]; - double sin_t = -sqrt(sx*sx + sy*sy); + int pixelstart = static_cast<int> (1.0*me/nprocs * npixels); + int pixelstop = static_cast<int> (1.0*(me+1)/nprocs * npixels); - double mytheta = random->uniform() * SSAOJitter; - double ao = 0.0; + for (int index = pixelstart; index < pixelstop; index++) { + int x = index % width; + int y = index / width; - for (s = 0; s < SSAOSamples; s ++) { - double hx = cos(mytheta); - double hy = sin(mytheta); - mytheta += delTheta; + double cdepth = depthBuffer[index]; + if (cdepth < 0) { continue; } - // multiply by z cross surface tangent - // so that dot (aka cos) works here + double sx = surfaceBuffer[index * 2 + 0]; + double sy = surfaceBuffer[index * 2 + 1]; + double sin_t = -sqrt(sx*sx + sy*sy); - double scaled_sin_t = sin_t * (hx*sy + hy*sx); + double mytheta = random->uniform() * SSAOJitter; + double ao = 0.0; - // Bresenham's line algorithm to march over depthBuffer + for (int s = 0; s < SSAOSamples; s ++) { + double hx = cos(mytheta); + double hy = sin(mytheta); + mytheta += delTheta; - int dx = static_cast<int> (hx * pixelRadius); - int dy = static_cast<int> (hy * pixelRadius); - int ex = x + dx; - if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } - int ey = y + dy; - if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } - double delta; - int small, large; - double lenIncr; - if (fabs(hx) > fabs(hy)) { - small = (hx > 0) ? 1 : -1; - large = (hy > 0) ? width : -width; - delta = fabs(hy / hx); - } else { - small = (hy > 0) ? width : -width; - large = (hx > 0) ? 1 : -1; - delta = fabs(hx / hy); + // multiply by z cross surface tangent + // so that dot (aka cos) works here + + double scaled_sin_t = sin_t * (hx*sy + hy*sx); + + // Bresenham's line algorithm to march over depthBuffer + + int dx = static_cast<int> (hx * pixelRadius); + int dy = static_cast<int> (hy * pixelRadius); + int ex = x + dx; + if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } + int ey = y + dy; + if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } + double delta; + int small, large; + double lenIncr; + if (fabs(hx) > fabs(hy)) { + small = (hx > 0) ? 1 : -1; + large = (hy > 0) ? width : -width; + delta = fabs(hy / hx); + } else { + small = (hy > 0) ? width : -width; + large = (hx > 0) ? 
1 : -1; + delta = fabs(hx / hy); + } + lenIncr = sqrt (1 + delta * delta) * pixelWidth; + + // initialize with one step + // because the center point doesn't need testing + + int end = ex + ey * width; + int ind = index + small; + double len = lenIncr; + double err = delta; + if (err >= 1.0) { + ind += large; + err -= 1.0; + } + + double minPeak = -1; + double peakLen = 0.0; + int stepsTaken = 1; + while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { + if (ind < 0 || ind >= (width*height)) { + break; } - lenIncr = sqrt (1 + delta * delta) * pixelWidth; - // initialize with one step - // because the center point doesn't need testing + // cdepth - depthBuffer B/C we want it in the negative z direction - int end = ex + ey * width; - int ind = index + small; - double len = lenIncr; - double err = delta; + if (minPeak < 0 || (depthBuffer[ind] >= 0 && + depthBuffer[ind] < minPeak)) { + minPeak = depthBuffer[ind]; + peakLen = len; + } + ind += small; + len += lenIncr; + err += delta; if (err >= 1.0) { ind += large; err -= 1.0; } - - double minPeak = -1; - double peakLen = 0.0; - int stepsTaken = 1; - while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { - if (ind < 0 || ind >= (width*height)) { - break; - } - - // cdepth - depthBuffer B/C we want it in the negative z direction - - if (minPeak < 0 || (depthBuffer[ind] >= 0 && - depthBuffer[ind] < minPeak)) { - minPeak = depthBuffer[ind]; - peakLen = len; - } - ind += small; - len += lenIncr; - err += delta; - if (err >= 1.0) { - ind += large; - err -= 1.0; - } - stepsTaken ++; - } - - if (peakLen > 0) { - double h = atan ((cdepth - minPeak) / peakLen); - ao += saturate(sin (h) - scaled_sin_t); - } else { - ao += saturate(-scaled_sin_t); - } + stepsTaken ++; } - ao /= (double)SSAOSamples; - double c[3]; - c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); - c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); - c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); - c[0] *= (1.0 - ao); - c[1] *= (1.0 - ao); - c[2] *= (1.0 - ao); - imageBuffer[index * 3 + 0] = (int) c[0]; - imageBuffer[index * 3 + 1] = (int) c[1]; - imageBuffer[index * 3 + 2] = (int) c[2]; + if (peakLen > 0) { + double h = atan ((cdepth - minPeak) / peakLen); + ao += saturate(sin (h) - scaled_sin_t); + } else { + ao += saturate(-scaled_sin_t); + } } + ao /= (double)SSAOSamples; + + double c[3]; + c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); + c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); + c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); + c[0] *= (1.0 - ao); + c[1] *= (1.0 - ao); + c[2] *= (1.0 - ao); + imageBuffer[index * 3 + 0] = (int) c[0]; + imageBuffer[index * 3 + 1] = (int) c[1]; + imageBuffer[index * 3 + 2] = (int) c[2]; } } diff --git a/src/image.h b/src/image.h index 7df81425d9..1de455d4bd 100644 --- a/src/image.h +++ b/src/image.h @@ -73,6 +73,10 @@ class Image : protected Pointers { double *depthcopy,*surfacecopy; unsigned char *imageBuffer,*rgbcopy,*writeBuffer; + // MPI_Gatherv + + int *recvcounts,*displs; + // constant view params double FOV; diff --git a/src/info.cpp b/src/info.cpp index bf6f14a48a..f1dc96645b 100644 --- a/src/info.cpp +++ b/src/info.cpp @@ -1449,8 +1449,13 @@ void Info::get_memory_info(double *meminfo) meminfo[2] = (double)pmc.PeakWorkingSetSize/1048576.0; #else #if defined(__linux__) +#if defined(__GLIBC__) && __GLIBC_PREREQ(2, 33) + struct mallinfo2 mi; + mi = mallinfo2(); +#else struct mallinfo mi; mi = mallinfo(); +#endif 
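+ // uordblks = heap space in use, hblkhd = space in mmap'd regions; + // dividing by 1048576 converts bytes to MBytes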
meminfo[1] = (double)mi.uordblks/1048576.0+(double)mi.hblkhd/1048576.0; #endif struct rusage ru; diff --git a/src/kspace.cpp b/src/kspace.cpp index 5556a5e8d0..f44cc42aaf 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -564,9 +564,9 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"kmax/ewald") == 0) { if (iarg+4 > narg) error->all(FLERR,"Illegal kspace_modify command"); - kx_ewald = atoi(arg[iarg+1]); - ky_ewald = atoi(arg[iarg+2]); - kz_ewald = atoi(arg[iarg+3]); + kx_ewald = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + ky_ewald = utils::inumeric(FLERR,arg[iarg+2],false,lmp); + kz_ewald = utils::inumeric(FLERR,arg[iarg+3],false,lmp); if (kx_ewald < 0 || ky_ewald < 0 || kz_ewald < 0) error->all(FLERR,"Bad kspace_modify kmax/ewald parameter"); if (kx_ewald > 0 && ky_ewald > 0 && kz_ewald > 0) @@ -583,15 +583,15 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"force/disp/real") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_real_6 = atof(arg[iarg+1]); + accuracy_real_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"force/disp/kspace") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_kspace_6 = atof(arg[iarg+1]); + accuracy_kspace_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"eigtol") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - splittol = atof(arg[iarg+1]); + splittol = utils::numeric(FLERR,arg[iarg+1],false,lmp); if (splittol >= 1.0) error->all(FLERR,"Kspace_modify eigtol must be smaller than one"); iarg += 2; diff --git a/src/lammps.cpp b/src/lammps.cpp index 6734fbd209..277ec4414f 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -842,12 +842,12 @@ void LAMMPS::post_create() if (strcmp(suffix,"omp") == 0 && !modify->check_package("OMP")) error->all(FLERR,"Using suffix omp without USER-OMP package installed"); - if (strcmp(suffix,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); if (suffix2) { - if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix2,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix2,"omp") == 0) input->one("package omp 0"); } diff --git a/src/library.cpp b/src/library.cpp index 71bf205d90..2a7bbf07b3 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -4128,16 +4128,18 @@ void lammps_get_os_info(char *buffer, int buf_size) /* ---------------------------------------------------------------------- */ /** This function is used to query whether LAMMPS was compiled with - * a real MPI library or in serial. + * a real MPI library or in serial. For the real MPI library it + * reports the size of the MPI communicator in bytes (4 or 8), + * which allows checking for compatibility with a hosting code. 
* - * \return 0 when compiled with MPI STUBS, otherwise 1 */ + * \return 0 when compiled with MPI STUBS, otherwise the MPI_Comm size in bytes */ int lammps_config_has_mpi_support() { #ifdef MPI_STUBS return 0; #else - return 1; + return sizeof(MPI_Comm); #endif } diff --git a/src/reset_atom_ids.h b/src/reset_atom_ids.h index 7c5c53e2ba..02a7f77e8d 100644 --- a/src/reset_atom_ids.h +++ b/src/reset_atom_ids.h @@ -37,7 +37,7 @@ class ResetIDs : protected Pointers { int ilocal; }; - #if defined(LMP_QSORT) +#if defined(LMP_QSORT) // static variable across all ResetID objects, for qsort callback static AtomRvous *sortrvous; #endif diff --git a/unittest/c-library/test_library_config.cpp b/unittest/c-library/test_library_config.cpp index f196f800da..e5eb044d31 100644 --- a/unittest/c-library/test_library_config.cpp +++ b/unittest/c-library/test_library_config.cpp @@ -74,7 +74,7 @@ TEST(LAMMPSConfig, package_name) EXPECT_EQ(lammps_config_package_name(numpkgs + 10, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } else { - EXPECT_EQ(lammps_config_package_name(0, buf, 128), 1); + EXPECT_EQ(lammps_config_package_name(0, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } }; @@ -200,7 +200,10 @@ TEST(LAMMPSConfig, exceptions) TEST(LAMMPSConfig, mpi_support) { - EXPECT_EQ(lammps_config_has_mpi_support(), LAMMPS_HAS_MPI); + if (LAMMPS_HAS_MPI) + EXPECT_GT(lammps_config_has_mpi_support(), 0); + else + EXPECT_EQ(lammps_config_has_mpi_support(), 0); }; TEST(LAMMPSConfig, png_support) diff --git a/unittest/python/python-open.py b/unittest/python/python-open.py index 67500ea6fa..5140ce9185 100644 --- a/unittest/python/python-open.py +++ b/unittest/python/python-open.py @@ -37,7 +37,7 @@ class PythonOpen(unittest.TestCase): lmp=lammps(name=self.machine) self.assertIsNot(lmp.lmp,None) self.assertEqual(lmp.opened,1) - self.assertEqual(has_mpi4py,lmp.has_mpi4py) + self.assertEqual(has_mpi and has_mpi4py,lmp.has_mpi4py) self.assertEqual(has_mpi,lmp.has_mpi_support) lmp.close() self.assertIsNone(lmp.lmp,None)
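A minimal sketch of how a hosting code could use the new return value of lammps_config_has_mpi_support() to verify MPI compatibility before coupling to LAMMPS (the host program below is illustrative only; it assumes the LAMMPS C library header "library.h" and an MPI installation are available at compile time):

#include <stdio.h>   // printf
#include <mpi.h>     // MPI_Comm, MPI_Init, MPI_Finalize
#include "library.h" // LAMMPS C library interface

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  // 0 = LAMMPS was built with the serial MPI STUBS library; otherwise
  // the value is sizeof(MPI_Comm) as seen by LAMMPS: typically 4 for
  // MPICH (int handle) and 8 for Open MPI (pointer handle) on 64-bit
  int lmpcommsize = lammps_config_has_mpi_support();
  if (lmpcommsize == 0)
    printf("LAMMPS was compiled with the MPI STUBS library\n");
  else if (lmpcommsize != (int) sizeof(MPI_Comm))
    printf("MPI mismatch: host MPI_Comm is %d bytes, LAMMPS reports %d\n",
           (int) sizeof(MPI_Comm), lmpcommsize);
  else
    printf("host MPI and LAMMPS MPI agree on the MPI_Comm size\n");
  MPI_Finalize();
  return 0;
}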