diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc33da60de..2cc11d4ecb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -37,6 +37,10 @@ enable_language(CXX) ##################################################################### include(CheckCCompilerFlag) +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") # quoted so an empty compiler ID cannot break the if() syntax + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") +endif() + ######################################################################## # User input options # ######################################################################## @@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN}) option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF) if(LAMMPS_EXCEPTIONS) add_definitions(-DLAMMPS_EXCEPTIONS) - set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS") + set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS") endif() set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically") @@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR}) ############################################ add_library(lammps ${LIB_SOURCES}) target_link_libraries(lammps ${LAMMPS_LINK_LIBS}) -add_dependencies(lammps ${LAMMPS_DEPS}) +if(LAMMPS_DEPS) + add_dependencies(lammps ${LAMMPS_DEPS}) +endif() set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE}) if(BUILD_SHARED_LIBS) set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION}) diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index d9a9fb4163..e08784bf6c 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -705,7 +705,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding quantum forces calculated by LATTE. More information on LATTE can be found at this web site: -"https://github.com/lanl/LATTE"_#latte_home.
A brief technical +"https://github.com/lanl/LATTE"_latte_home. A brief technical description is given with the "fix latte"_fix_latte.html command. :link(latte_home,https://github.com/lanl/LATTE) @@ -728,6 +728,7 @@ make lib-latte args="-b" # download and build in lib/latte/LATTE- make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte make lib-latte args="-b -m gfortran" # download and build in lib/latte and # copy Makefile.lammps.gfortran to Makefile.lammps +:pre Note that 3 symbolic (soft) links, "includelink" and "liblink" and "filelink", are created in lib/latte to point into the LATTE home dir. diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt index f78e13b866..4edd610546 100644 --- a/doc/src/fix_latte.txt +++ b/doc/src/fix_latte.txt @@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond integrals are parameterized using a Slater-Koster tight-binding approach. This procedure, which usually is referred to as the DFTB method has been described in detail by ("Elstner"_#Elstner) and -("Finnis"_#Finnis) and coworkers. +("Finnis"_#Finnis2) and coworkers. The work of the LATTE developers follows that of Elstner closely with respect to the physical model. However, the development of LATTE is @@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, 7260 (1998). -:link(Finnis) +:link(Finnis2) [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998). @@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014). [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys., 141, 164123, (2014). -:link(Niklasson2014) +:link(Niklasson2017) [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017). -:link(Niklasson2012) -[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 +:link(Cawkwell2012) +[(Cawkwell2012)] A. 
M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 (17), 174308 (2012). :link(Negre2016) diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt index 52d8a7df84..73b3e31266 100644 --- a/doc/src/fix_neb.txt +++ b/doc/src/fix_neb.txt @@ -93,7 +93,7 @@ intermediate replica with the previous and the next image: Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre -Note that in this case the specified {Kspring) is in force/distance +Note that in this case the specified {Kspring} is in force/distance units. With a value of {ideal}, the spring force is computed as suggested in @@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and RDideal is the ideal RD for which all the images are equally spaced. I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where I is the replica number). The meanDist is the average distance -between replicas. Note that in this case the specified {Kspring) is +between replicas. Note that in this case the specified {Kspring} is in force units. Note that the {ideal} form of nudging can often be more effective at diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt index 7000a66c51..e363273a75 100644 --- a/doc/src/fixes.txt +++ b/doc/src/fixes.txt @@ -59,6 +59,7 @@ Fixes :h1 fix_langevin fix_langevin_drude fix_langevin_eff + fix_latte fix_lb_fluid fix_lb_momentum fix_lb_pc diff --git a/doc/src/lammps.book b/doc/src/lammps.book index 86dfe78af3..b74ec49aed 100644 --- a/doc/src/lammps.book +++ b/doc/src/lammps.book @@ -187,6 +187,7 @@ fix_ipi.html fix_langevin.html fix_langevin_drude.html fix_langevin_eff.html +fix_latte.html fix_lb_fluid.html fix_lb_momentum.html fix_lb_pc.html diff --git a/doc/src/package.txt b/doc/src/package.txt index 58f6a5e34d..5c698934e8 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -62,7 +62,7 @@ args = arguments specific to the style :l {no_affinity} values = none {kokkos} args = keyword value ... 
zero or more keyword/value pairs may be appended - keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} + keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} {neigh} value = {full} or {half} full = full neighbor list half = half neighbor list built in thread-safe manner @@ -75,9 +75,10 @@ args = arguments specific to the style :l {binsize} value = size size = bin size for neighbor list construction (distance units) {comm} value = {no} or {host} or {device} - use value for both comm/exchange and comm/forward + use value for comm/exchange and comm/forward and comm/reverse {comm/exchange} value = {no} or {host} or {device} {comm/forward} value = {no} or {host} or {device} + {comm/reverse} value = {no} or {host} or {device} no = perform communication pack/unpack in non-KOKKOS mode host = perform pack/unpack on host (e.g. with OpenMP threading) device = perform pack/unpack on device (e.g. on GPU) @@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at performing pairwise interactions, then this rule of thumb may give too large a binsize. -The {comm} and {comm/exchange} and {comm/forward} keywords determine +The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists are rebuilt. The data is only for atoms that migrate to new processors. -"Forward" communication happens every timestep. The data is for atom +"Forward" communication happens every timestep. "Reverse" communication +happens every timestep if the {newton} option is on. The data is for atom coordinates and any other atom properties that needs to be updated for ghost atoms owned by each processor. 
The {comm} keyword is simply a short-cut to set the same value -for both the {comm/exchange} and {comm/forward} keywords. +for all of the {comm/exchange} and {comm/forward} and {comm/reverse} keywords. The value options for all 3 keywords are {no} or {host} or {device}. A value of {no} means to use the standard non-KOKKOS method of diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt index a0026432ec..03e77f53ab 100644 --- a/doc/src/pair_eam.txt +++ b/doc/src/pair_eam.txt @@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix. Style {eam/fs} computes pairwise interactions for metals and metal alloys using a generalized form of EAM potentials due to Finnis and -Sinclair "(Finnis)"_#Finnis. The total energy Ei of an atom I is +Sinclair "(Finnis)"_#Finnis1. The total energy Ei of an atom I is given by :c,image(Eqs/pair_eam_fs.jpg) @@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004). [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983). Daw, Baskes, Phys Rev B, 29, 6443 (1984). -:link(Finnis) +:link(Finnis1) [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
:link(Stukowski) diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 43d3f17d63..d414056187 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b8236e8fd1..4641232a1f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -443,7 +443,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_LIBS += 
-lmemkind + KOKKOS_LIBS += -lmemkind -lnuma tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) endif @@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P8. - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif endif endif @@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P9. - KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 - KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif endif endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 9082e47052..3db9a145d7 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool::free_state(const Random_XorShift102 } +#endif + +#if defined(KOKKOS_ENABLE_ROCM) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + const int stride_; + friend class Random_XorShift1024_Pool; + 
public: + + typedef Kokkos::Experimental::ROCm device_type; + typedef Random_XorShift1024_Pool pool_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); /* keep the redrawn sample so the rejection loop can terminate */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + tmp = urand64(); /* keep the redrawn sample so the rejection loop can terminate */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + +
KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + tmp = rand(); /* keep the redrawn sample so the rejection loop can terminate */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + tmp = rand64(); /* keep the redrawn sample so the rejection loop can terminate */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return drand(end-start)+start; /* use the double overload; frand() would truncate to float precision */ + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; + S = U*U+V*V; + } + return U*std::sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state()
const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __HCC_ACCELERATOR__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(state_, p_(i), i); +#else + return Random_XorShift1024(state_, p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __HCC_ACCELERATOR__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + #endif diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index b74192ef18..a5a10c82ee 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ 
b/lib/kokkos/algorithms/unit_tests/Makefile @@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Threads @@ -51,6 +57,9 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda +KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm + KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda +test-rocm: KokkosAlgorithms_UnitTest_ROCm + ./KokkosAlgorithms_UnitTest_ROCm + test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..720b377ed2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void rocm_test_random_xorshift64( int num_draws ) +{ + Impl::test_random >(num_draws); +} + +void rocm_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random >(num_draws); +} + + +#define ROCM_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( rocm, Random_XorShift64 ) { \ + rocm_test_random_xorshift64(num_draws); \ + } + +#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( rocm, Random_XorShift1024 ) { \ + rocm_test_random_xorshift1024(num_draws); \ + } + +#define ROCM_SORT_UNSIGNED( size ) \ + TEST_F( rocm, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \ + } + +ROCM_RANDOM_XORSHIFT64( 132141141 ) +ROCM_RANDOM_XORSHIFT1024( 52428813 ) +ROCM_SORT_UNSIGNED(171) + +#undef ROCM_RANDOM_XORSHIFT64 +#undef ROCM_RANDOM_XORSHIFT1024 +#undef ROCM_SORT_UNSIGNED +} +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index ca34648780..b88b334f8b 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -27,7 +27,7 @@ fi HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then MY_PID="$BASHPID" - HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) + 
HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi ################################################################################ @@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) ################################################################################ HPCBIND_QUEUE_NAME="" declare -i HPCBIND_QUEUE_INDEX=0 -declare -i HPCBIND_QUEUE_GPU_MAPPING=0 +declare -i HPCBIND_QUEUE_MAPPING=0 -if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 - HPCBIND_QUEUE_NAME="sbatch" +if [[ ! -z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_INDEX=${PMI_RANK} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} +elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} elif [[ ! -z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="bsub" HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} elif [[ ! 
-z "${ALPS_APP_PE}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} fi - ################################################################################ # Show help ################################################################################ @@ -91,13 +102,14 @@ function show_help { echo " --proc-bind= Set the initial process mask for the script" echo " LOC can be any valid location argument for" echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore its parent process binding" echo " --distribute=N Distribute the current cpuset into N partitions" echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" echo " sequential order" - echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" @@ -110,22 +122,30 @@ function show_help { echo " --force-openmp-proc-bind=" echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" - echo " --show-bindings Show the bindings" - echo " --lstopo Show bindings in lstopo without executing a command" - echo " -v|--verbose Show options and relevant environment variables" + echo " --output-prefix=

Save the output to files of the form" + echo " P-N.hpc.log, P-N.out and P-N.err where P is the prefix" + echo " and N is the queue index or mpi rank (no spaces)" + echo " --output-mode= How console output should be handled." + echo " Options are all, rank0, and none. Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " -v|--verbose Print bindings and relevant environment variables" echo " -h|--help Show this message" echo "" echo "Sample Usage:" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." - echo " Bing the process to all even cores" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." - echo " Bind to the first 64 cores and split the current process cpuset into 4" - echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." - echo " skip GPU 0 when mapping visible devices" + echo " Bind to the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings" - echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " ${cmd} --proc-bind=numa:0 -- command" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -144,7 +164,7 @@ fi declare -a UNKNOWN_ARGS=() declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} declare -i HPCBIND_DISTRIBUTE=1 -declare -i HPCBIND_PARTITION=0 +declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 declare -i HPCBIND_OPENMP_PERCENT=100 @@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND="" HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} declare -i HPCBIND_VERBOSE=0 -declare -i HPCBIND_SHOW_BINDINGS=0 declare -i HPCBIND_LSTOPO=0 -for i in $@; do - case $i in +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in # number of partitions to create --no-hwloc-bind) HPCBIND_ENABLE_HWLOC_BIND=0 @@ -169,6 +193,10 @@ for i in $@; do HPCBIND_PROC_BIND="${i#*=}" shift ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" shift @@ -182,8 +210,8 @@ for i in $@; do HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') shift ;; - --gpu-ignore-queue) - HPCBIND_QUEUE_GPU_MAPPING=0 + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 shift ;; --no-gpu-mapping) @@ -218,14 +246,18 @@ for i in $@; do HPCBIND_OPENMP_NESTED="false" shift ;; - --show-bindings) - HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=1 + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" shift ;; --lstopo) HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=0 HPCBIND_LSTOPO=1 shift ;; @@ -239,6 +271,7 @@ for i in $@; do ;; # ignore remaining arguments --) + HPCBIND_HAS_COMMAND=1 shift break ;; @@ -250,16 +283,41 @@ for i in $@; do esac done 
+################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then + #default to rank0 printing to screen + HPCBIND_TEE=1 +fi + + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + > "${HPCBIND_LOG}" +fi + ################################################################################ # Check unknown arguments ################################################################################ if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" + echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a "${HPCBIND_LOG}") exit 1 fi - ################################################################################ # Check that visible gpus are valid ################################################################################ @@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then - echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a "${HPCBIND_LOG}") HPCBIND_VISIBLE_GPUS[$i]=0; fi done NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi - ################################################################################ # Check OpenMP percent
################################################################################ if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - echo "OpenMP percent < 1, setting to 1" HPCBIND_OPENMP_PERCENT=1 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - echo "OpenMP percent > 100, setting to 100" HPCBIND_OPENMP_PERCENT=100 fi @@ -291,15 +346,21 @@ fi # Check distribute ################################################################################ if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" HPCBIND_DISTRIBUTE=1 fi -if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then - echo "Invalid input for distribute-partition, changing to 0" +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi ################################################################################ # Find cpuset and num threads @@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) else - BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) fi - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) - HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at 
core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) @@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED} ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi fi @@ -389,22 +454,22 @@ fi export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} -export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else - export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi -export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') -export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +export 
HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} - export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} - export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -412,43 +477,63 @@ fi # Print verbose ################################################################################ -if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then - MY_ENV=$(env | sort) - echo "[HPCBIND]" - echo "${MY_ENV}" | grep -E "^HPCBIND_" - echo "[CUDA]" - echo "${MY_ENV}" | grep -E "^CUDA_" - echo "[OPENMP]" - echo "${MY_ENV}" | grep -E "^OMP_" -fi +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} -if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu -elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." 
>> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi fi ################################################################################ # Run command ################################################################################ -if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ - else - eval $@ - fi -else - if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! 
-z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi fi fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind deleted file mode 100755 index b6fe07a1bd..0000000000 --- a/lib/kokkos/bin/kokkos-bind +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env bash - -# check if hwloc commands exist -declare -i HAS_HWLOC=0 -type hwloc-bind >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-distrib >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ls >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-calc >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ps >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - - -#parse args -declare -a UNKNOWN_ARGS=() -declare -i DISTRIBUTE=1 -declare -i INDEX=0 -PROC_BIND="all" -CURRENT_CPUSET="" -OPENMP_VERSION=4.0 -OPENMP_PROC_BIND=True -OPENMP_NESTED=True -VERBOSE=False - -#get the current process cpuset -if [[ ${HAS_HWLOC} -eq 0 ]]; then - MY_PID="$BASHPID" - CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) - echo "$CURRENT_CPUSET" -fi - -function show_help { - local cmd=$(basename "$0") - echo "Usage: ${cmd} -- command ..." 
- echo " Uses hwloc to divide the node into the given number of groups," - echo " set the appropriate OMP_NUM_THREADS and execute the command on the" - echo " selected group." - echo "" - echo " NOTE: This command assumes it has exclusive use of the node" - echo "" - echo "Options:" - echo " --proc-bind= Set the initial process mask for the script. " - echo " LOC can be any valid location argumnet for" - echo " hwloc-calc. Defaults to the entire machine" - echo " --distribute=N Distribute the current proc-bind into N groups" - echo " --index=I Use the i'th group (zero based)" - echo " --openmp=M.m Set env variables for the given OpenMP version" - echo " (default 4.0)" - echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" - echo " --no-openmp-nested Set OMP_NESTED to false" - echo " -v|--verbose" - echo " -h|--help" - echo "" - echo "Sample Usage:" - echo " ${cmd} --distribute=4 --index=2 -v -- command ..." - echo "" -} - -if [[ "$#" -eq 0 ]]; then - show_help - exit 0 -fi - - -for i in $@; do - case $i in - # number of partitions to create - --proc-bind=*) - PROC_BIND="${i#*=}" - shift - ;; - --distribute=*) - DISTRIBUTE="${i#*=}" - shift - ;; - # which group to use - --index=*) - INDEX="${i#*=}" - shift - ;; - --openmp=*) - OPENMP_VERSION="${i#*=}" - shift - ;; - --no-openmp-proc-bind) - OPENMP_PROC_BIND=False - shift - ;; - --no-openmp-nested) - OPENMP_NESTED=False - shift - ;; - -v|--verbose) - VERBOSE=True - shift - ;; - -h|--help) - show_help - exit 0 - ;; - # ignore remaining arguments - --) - shift - break - ;; - # unknown option - *) - UNKNOWN_ARGS+=("$i") - shift - ;; - esac -done - -if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" - exit 1 -fi - -if [[ ${DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" - DISTRIBUTE=1 -fi - -if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then - echo "Invalid input for index, changing index to 0" - INDEX=0 -fi - -if [[ ${HAS_HWLOC} 
-ne 0 ]]; then - echo "hwloc not found, no process binding will occur" - DISTRIBUTE=1 - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -eq 0 ]]; then - - if [[ "${CURRENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${PROC_BIND}) - else - BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND}) - fi - - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE})) - CPUSET=${CPUSETS[${INDEX}]} - NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: true" - echo " proc_bind: ${PROC_BIND}" - echo " distribute: ${DISTRIBUTE}" - echo " index: ${INDEX}" - echo " parent_cpuset: ${CURRENT_CPUSET}" - echo " cpuset: ${CPUSET}" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - hwloc-bind ${CPUSET} -- $@ -else - NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: false" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ 
"${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - eval $@ -fi - diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 09fa5d500a..76e33f3c66 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -130,10 +133,16 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 96b05c02e1..6f9ca897d9 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test index 92a1b1c068..ffb565fcbb 100644 --- a/lib/kokkos/config/trilinos-integration/checkin-test +++ 
b/lib/kokkos/config/trilinos-integration/checkin-test @@ -1,4 +1,4 @@ module purge -module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu #Run Trilinos CheckinTest diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 0408472c68..996b6b5610 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -125,6 +125,123 @@ namespace Impl { }; } +/// \class GraphRowViewConst +/// \brief View of a row of a sparse graph. +/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph. +/// +/// This class provides a generic view of a row of a sparse graph. +/// We intended this class to view a row of a StaticCrsGraph, but +/// GraphType need not necessarily be CrsMatrix. +/// +/// The row view is suited for computational kernels like sparse +/// matrix-vector multiply, as well as for modifying entries in the +/// sparse matrix. The view is always const as it does not allow graph modification. +/// +/// Here is an example loop over the entries in the row: +/// \code +/// typedef typename GraphRowViewConst::ordinal_type ordinal_type; +/// +/// GraphRowView G_i = ...; +/// const ordinal_type numEntries = G_i.length; +/// for (ordinal_type k = 0; k < numEntries; ++k) { +/// ordinal_type j = G_i.colidx (k); +/// // ... do something with A_ij and j ... 
+/// } +/// \endcode +/// +/// GraphType must provide the \c data_type +/// typedefs. In addition, it must make sense to use GraphRowViewConst to +/// view a row of GraphType. In particular, column +/// indices of a row must be accessible using the entries +/// resp. colidx arrays given to the constructor of this +/// class, with a constant stride between successive entries. +/// The stride is one for the compressed sparse row storage format (as +/// is used by CrsMatrix), but may be greater than one for other +/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal). +template +struct GraphRowViewConst { + //! The type of the column indices in the row. + typedef const typename GraphType::data_type ordinal_type; + +private: + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive entries in the row. + /// + /// For compressed sparse row (CSR) storage, this is always one. + /// This might be greater than one for storage formats like ELLPACK + /// or Jagged Diagonal. Nevertheless, the stride can never be + /// greater than the number of rows or columns in the matrix. Thus, + /// \c ordinal_type is the correct type. + const ordinal_type stride_; + +public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( ordinal_type* const colidx_in, + const ordinal_type& stride, + const ordinal_type& count) : + colidx_ (colidx_in), stride_ (stride), length (count) + {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. 
+ /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array + /// + /// \tparam OffsetType The type of \c idx (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. + /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( const typename GraphType::entries_type& colidx_in, + const ordinal_type& stride, + const ordinal_type& count, + const OffsetType& idx, + const typename std::enable_if::value, int>::type& = 0) : + colidx_ (&colidx_in(idx)), stride_ (stride), length (count) + {} + + /// \brief Number of entries in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. + const ordinal_type length; + + /// \brief (Const) reference to the column index of entry i in this + /// row of the sparse matrix. + /// + /// "Entry i" is not necessarily the entry with column index i, nor + /// does i necessarily correspond to the (local) row index. + KOKKOS_INLINE_FUNCTION + ordinal_type& colidx (const ordinal_type& i) const { + return colidx_[i*stride_]; + } + + /// \brief An alias for colidx + KOKKOS_INLINE_FUNCTION + ordinal_type& operator()(const ordinal_type& i) const { + return colidx(i); + } +}; + + /// \class StaticCrsGraph /// \brief Compressed row storage array. /// @@ -218,6 +335,38 @@ public: static_cast (0); } + /// \brief Return a const view of row i of the graph. + /// + /// If row i does not belong to the graph, return an empty view. 
+ /// + /// The returned object \c view implements the following interface: + ///

    + ///
  • \c view.length is the number of entries in the row
  • + ///
  • \c view.colidx(k) returns a const reference to the + /// column index of the k-th entry in the row
  • + ///
+ /// k is not a column index; it just counts from 0 to + /// view.length - 1. + /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. That allows compile-time + /// polymorphism for different kinds of sparse matrix formats (e.g., + /// ELLPACK or Jagged Diagonal) that we may wish to support in the + /// future. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst rowConst (const data_type i) const { + const size_type start = row_map(i); + // count is guaranteed to fit in ordinal_type, as long as no row + // has duplicate entries. + const data_type count = static_cast (row_map(i+1) - start); + + if (count == 0) { + return GraphRowViewConst (NULL, 1, 0); + } else { + return GraphRowViewConst (entries, 1, count, start); + } + } + /** \brief Create a row partitioning into a given number of blocks * balancing non-zeros + a fixed cost per row. */ diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index 46321378d9..c184c14d07 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> if (RP::inner_direction == RP::Left) { // Loop over size maxnumblocks until full range covered for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -157,11 +157,11 @@ 
struct DeviceIterateTile<2,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += 
gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -340,19 
+340,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const 
index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const 
index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < 
m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - 
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < 
m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const 
index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const 
index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + 
thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { 
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type 
offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index cae8ecd489..079d9f0889 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 26b47a8b74..f8355f0d06 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -242,45 +242,89 @@ public: re_ = v; } + template KOKKOS_INLINE_FUNCTION - complex& operator += (const complex& src) { + complex& + operator += (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; return *this; } + 
template KOKKOS_INLINE_FUNCTION - void operator += (const volatile complex& src) volatile { + void + operator += (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; } KOKKOS_INLINE_FUNCTION - complex& operator += (const RealType& src) { + complex& + operator += (const std::complex& src) { + re_ += src.real(); + im_ += src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator += (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile RealType& src) volatile { + void + operator += (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; } - + + template KOKKOS_INLINE_FUNCTION - complex& operator -= (const complex& src) { + complex& + operator -= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src.re_; im_ -= src.im_; return *this; } KOKKOS_INLINE_FUNCTION - complex& operator -= (const RealType& src) { + complex& + operator -= (const std::complex& src) { + re_ -= src.real(); + im_ -= src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator -= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src; return *this; } + template KOKKOS_INLINE_FUNCTION - complex& operator *= (const complex& src) { + complex& + operator *= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -288,8 
+332,12 @@ public: return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile complex& src) volatile { + void + operator *= (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -297,20 +345,70 @@ public: } KOKKOS_INLINE_FUNCTION - complex& operator *= (const RealType& src) { + complex& + operator *= (const std::complex& src) { + const RealType realPart = re_ * src.real() - im_ * src.imag(); + const RealType imagPart = re_ * src.imag() + im_ * src.real(); + re_ = realPart; + im_ = imagPart; + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator *= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile RealType& src) volatile { + void + operator *= (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; } + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const complex& y) { + complex& + operator /= (const complex& y) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. + // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. 
+ if (s == 0.0) { + this->re_ /= s; + this->im_ /= s; + } + else { + const complex x_scaled (this->re_ / s, this->im_ / s); + const complex y_conj_scaled (y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + complex& + operator /= (const std::complex& y) { + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. @@ -334,57 +432,95 @@ public: return *this; } + + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const RealType& src) { + complex& + operator /= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + re_ /= src; im_ /= src; return *this; } + template KOKKOS_INLINE_FUNCTION - bool operator == (const complex& src) { - return (re_ == src.re_) && (im_ == src.im_); + bool + operator == (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src.re_)) && (im_ == static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator == (const RealType src) { - return (re_ == src) && (im_ == RealType(0)); + bool + operator == (const std::complex& src) { + return (re_ == src.real()) && (im_ == src.imag()); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator == (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src)) && (im_ == RealType(0)); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator != (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ != static_cast(src.re_)) || (im_ 
!= static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator != (const complex& src) { - return (re_ != src.re_) || (im_ != src.im_); + bool + operator != (const std::complex& src) { + return (re_ != src.real()) || (im_ != src.imag()); } + template KOKKOS_INLINE_FUNCTION - bool operator != (const RealType src) { - return (re_ != src) || (im_ != RealType(0)); - } + bool + operator != (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + return (re_ != static_cast(src)) || (im_ != RealType(0)); + } + }; //! Binary + operator for complex complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const complex& y) { - return complex (x.real () + y.real (), x.imag () + y.imag ()); +complex::type> +operator + (const complex& x, const complex& y) { + return complex::type > (x.real () + y.real (), x.imag () + y.imag ()); } //! Binary + operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const RealType& y) { - return complex (x.real () + y , x.imag ()); +complex::type> +operator + (const complex& x, const RealType2& y) { + return complex::type> (x.real () + y , x.imag ()); } //! Binary + operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const RealType& x, const complex& y) { - return complex (x + y.real (), y.imag ()); +complex::type> +operator + (const RealType1& x, const complex& y) { + return complex::type> (x + y.real (), y.imag ()); } //! Unary + operator for complex. @@ -396,27 +532,27 @@ operator + (const complex& x) { } //! Binary - operator for complex. 
-template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const complex& y) { - return complex (x.real () - y.real (), x.imag () - y.imag ()); +complex::type> +operator - (const complex& x, const complex& y) { + return complex::type> (x.real () - y.real (), x.imag () - y.imag ()); } //! Binary - operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const RealType& y) { - return complex (x.real () - y , x.imag ()); +complex::type> +operator - (const complex& x, const RealType2& y) { + return complex::type> (x.real () - y , x.imag ()); } //! Binary - operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const RealType& x, const complex& y) { - return complex (x - y.real (), - y.imag ()); +complex::type> +operator - (const RealType1& x, const complex& y) { + return complex::type> (x - y.real (), - y.imag ()); } //! Unary - operator for complex. @@ -428,12 +564,12 @@ operator - (const complex& x) { } //! Binary * operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +complex::type> +operator * (const complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for std::complex and complex. @@ -446,33 +582,34 @@ operator * (const complex& x, const complex& y) { /// This function cannot be called in a CUDA device function, because /// std::complex's methods and nonmember functions are not marked as /// CUDA device functions. 
-template -complex -operator * (const std::complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +template +inline +complex::type> +operator * (const std::complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const RealType& x, const complex& y) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const RealType1& x, const complex& y) { + return complex::type> (x * y.real (), x * y.imag ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& y, const RealType& x) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const complex& y, const RealType2& x) { + return complex::type> (x * y.real (), x * y.imag ()); } //! Imaginary part of a complex number. @@ -539,33 +676,34 @@ complex pow (const complex& x) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const complex& x, const RealType2& y) { - return complex (real (x) / y, imag (x) / y); + return complex::type> (real (x) / y, imag (x) / y); } //! Binary operator / for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator / (const complex& x, const complex& y) { +complex::type> +operator / (const complex& x, const complex& y) { // Scale (by the "1-norm" of y) to avoid unwarranted overflow. 
// If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs (real (y)) + std::fabs (imag (y)); + typedef typename std::common_type::type common_real_type; + const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, // because y/s is NaN. if (s == 0.0) { - return complex (real (x) / s, imag (x) / s); + return complex (real (x) / s, imag (x) / s); } else { - const complex x_scaled (real (x) / s, imag (x) / s); - const complex y_conj_scaled (real (y) / s, -imag (y) / s); - const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + + const complex x_scaled (real (x) / s, imag (x) / s); + const complex y_conj_scaled (real (y) / s, -imag (y) / s); + const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y)) - complex result = x_scaled * y_conj_scaled; + complex result = x_scaled * y_conj_scaled; result /= y_scaled_abs; return result; } @@ -574,16 +712,19 @@ operator / (const complex& x, const complex& y) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const RealType1& x, const complex& y) { - return complex (x)/y; + return complex::type> (x)/y; } //! Equality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const complex& y) { - return real (x) == real (y) && imag (x) == imag (y); +bool +operator == (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(real (y)) && + static_cast(imag (x)) == static_cast(imag (y)) ); } /// \brief Equality operator for std::complex and Kokkos::complex. 
@@ -592,50 +733,68 @@ bool operator == (const complex& x, const complex& y) { /// Otherwise, CUDA builds will give compiler warnings ("warning: /// calling a constexpr __host__ function("real") from a __host__ /// __device__ function("operator==") is not allowed"). -template -bool operator == (const std::complex& x, const complex& y) { - return std::real (x) == real (y) && std::imag (x) == imag (y); +template +inline +bool +operator == (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) == static_cast(real (y)) && + static_cast(std::imag (x)) == static_cast(imag (y)) ); } - + //! Equality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const RealType2& y) { - return real (x) == y && imag (x) == static_cast (0.0); +bool +operator == (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(y) && + static_cast(imag (x)) == static_cast(0.0) ); } //! Equality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const RealType& x, const complex& y) { +bool +operator == (const RealType1& x, const complex& y) { return y == x; } //! Inequality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const complex& y) { - return real (x) != real (y) || imag (x) != imag (y); +bool +operator != (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(real (y)) || + static_cast(imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for std::complex and Kokkos::complex. 
-template -KOKKOS_INLINE_FUNCTION -bool operator != (const std::complex& x, const complex& y) { - return std::real (x) != real (y) || std::imag (x) != imag (y); +template +inline +bool +operator != (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) != static_cast(real (y)) || + static_cast(std::imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const RealType2& y) { - return real (x) != y || imag (x) != static_cast (0.0); +bool +operator != (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(y) || + static_cast(imag (x)) != static_cast(0.0) ); } //! Inequality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const RealType& x, const complex& y) { +bool +operator != (const RealType1& x, const complex& y) { return y != x; } diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index f089c16ad2..b9c131cd7a 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -353,7 +353,14 @@ struct CountAndFill { struct Fill {}; KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); - data_type* fill = &(m_crs.entries(j)); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? 
+ nullptr : (&(m_crs.entries(j))); m_functor(i, fill); } using self_type = CountAndFill; diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index 9c9af0dd8b..b811751a2c 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -147,12 +147,11 @@ public: , const size_t arg_alloc_size ) const; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return "HBW"; } private: AllocationMechanism m_alloc_mech; - static constexpr const char* m_name = "HBW"; friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >; }; diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 339571941d..a825fd54d3 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -192,7 +192,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast(0.0f);} KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast(1.0f);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;} }; @@ -200,7 +200,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;} }; @@ -208,7 +208,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() 
{return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;} }; diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp index b13b0b01de..0118d4667e 100644 --- a/lib/kokkos/core/src/Kokkos_ROCm.hpp +++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp @@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace } // namespace Kokkos + +#define threadIdx_x (hc_get_workitem_id(0)) +#define threadIdx_y (hc_get_workitem_id(1)) +#define threadIdx_z (hc_get_workitem_id(2)) + +#define blockIdx_x (hc_get_group_id(0)) +#define blockIdx_y (hc_get_group_id(1)) +#define blockIdx_z (hc_get_group_id(2)) + +#define blockDim_x (hc_get_group_size(0)) +#define blockDim_y (hc_get_group_size(1)) +#define blockDim_z (hc_get_group_size(2)) + +#define gridDim_x (hc_get_num_groups(0)) +#define gridDim_y (hc_get_num_groups(1)) +#define gridDim_z (hc_get_num_groups(2)) + + #include #include diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile index 8fb13b8954..a917cf1656 100644 --- a/lib/kokkos/core/src/Makefile +++ b/lib/kokkos/core/src/Makefile @@ -88,6 +88,7 @@ build-makefile-kokkos: echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos echo "" >> Makefile.kokkos echo "#Variables used in application Makefiles" >> Makefile.kokkos + echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 37d2ac8318..de84f6e59f 100644 --- 
a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f , thread_local_bytes ); + omp_set_num_threads(partition_size); f( omp_get_thread_num(), omp_get_num_threads() ); Impl::t_openmp_instance->~Exec(); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index 0b7a1e2583..f2674e5929 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -113,7 +113,6 @@ void reduce_enqueue( if (output_length < 1) return; - assert(output_result != nullptr); const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size); // allocate host and device memory for the results from each team @@ -176,14 +175,17 @@ void reduce_enqueue( } }); - ValueInit::init(ReducerConditional::select(f, reducer), output_result); + if (output_result != nullptr) + ValueInit::init(ReducerConditional::select(f, reducer), output_result); fut.wait(); copy(result,result_cpu.data()); - for(std::size_t i=0;i result(td.num_tiles); hc::array scratch(len); - tile_for(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] + tile_for(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] { const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; @@ -135,7 +135,7 @@ void scan_enqueue( ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]); copy(result_cpu.data(),result); - hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] + hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] { // const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index 3d3029535e..c5e73c8b26 100644 --- 
a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept return full != i ? _bit_scan_forward( ~i ) : -1 ; #elif defined( KOKKOS_COMPILER_IBM ) return full != i ? __cnttz4( ~i ) : -1 ; +#elif defined( KOKKOS_COMPILER_CRAYC ) + return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; #else @@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i ) return _bit_scan_forward(i); #elif defined( KOKKOS_COMPILER_IBM ) return __cnttz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? _popcnt(~i & (i-1)) : -1; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_ffs(i) - 1; #else - unsigned t = 1u; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t << 1; - ++r; + int offset = -1; + if ( i ) { + for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset ); } - return r; + return offset; #endif } @@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i ) return _bit_scan_reverse(i); #elif defined( KOKKOS_COMPILER_IBM ) return shift - __cntlz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? 
shift - _leadz32(i) : 0 ; #elif defined( __GNUC__ ) || defined( __GNUG__ ) return shift - __builtin_clz(i); #else - unsigned t = 1u << shift; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t >> 1; - ++r; + int offset = 0; + if ( i ) { + for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset ); } - return r; + return offset; #endif } @@ -142,6 +142,8 @@ int bit_count( unsigned i ) return _popcnt32(i); #elif defined( KOKKOS_COMPILER_IBM ) return __popcnt4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return _popcnt(i); #elif defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_popcount(i); #else diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index e11f8b6d34..cd0553218d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s } } -constexpr const char* HBWSpace::name() { - return m_name; -} - } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index ce5537fed3..c7f681699e 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -114,7 +114,7 @@ struct TestComplexBasicMath { typename Kokkos::View*,ExecSpace>::HostMirror h_results; void testit () { - d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",20); + d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",24); h_results = Kokkos::create_mirror_view(d_results); Kokkos::parallel_for(Kokkos::RangePolicy(0,1), *this); @@ -125,6 +125,7 @@ struct TestComplexBasicMath { std::complex b(3.25,5.75); std::complex d(1.0,2.0); double c = 9.3; + int e = 2; std::complex r; r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag()); @@ -147,6 +148,12 @@ struct TestComplexBasicMath { r = c-a; 
ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag()); r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag()); r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag()); + + r = a; + /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag()); + /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag()); + /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e); + /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e); } KOKKOS_INLINE_FUNCTION @@ -190,6 +197,12 @@ struct TestComplexBasicMath { d_results(17) = c-a; d_results(18) = c*a; d_results(19) = c/a; + + int e = 2; + d_results(20) = a+e; + d_results(21) = a-e; + d_results(22) = a*e; + d_results(23) = a/e; } }; diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index f579ddf02c..fbc3a65c2f 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -286,7 +286,9 @@ struct TestMDRange_2D { // Test with reducers - scalar { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; - range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + int s0 = 1; + int s1 = 1; + range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} ); TestMDRange_2D functor( N0, N1 ); @@ -297,7 +299,7 @@ struct TestMDRange_2D { parallel_reduce( range, functor, reducer_scalar ); - ASSERT_EQ( sum, 2 * N0 * N1 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) ); } // Test with reducers - scalar view { @@ -445,7 +447,9 @@ struct TestMDRange_2D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type 
point_type; - range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + const int s0 = 1; + const int s1 = 1; + range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); parallel_for( range, functor ); @@ -454,8 +458,8 @@ struct TestMDRange_2D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) { if ( h_view( i, j ) != 3 ) { ++counter; @@ -463,7 +467,7 @@ struct TestMDRange_2D { } if ( counter != 0 ) { - printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); + printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -699,6 +703,7 @@ struct TestMDRange_2D { ASSERT_EQ( counter, 0 ); } + } // end test_for2 }; // MDRange_2D @@ -749,7 +754,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); @@ -757,7 +765,7 @@ struct TestMDRange_3D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) ); } // Test with reducers - scalar @@ -952,7 +960,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( 
point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); parallel_for( range, functor ); @@ -961,9 +972,9 @@ struct TestMDRange_3D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) { if ( h_view( i, j, k ) != 3 ) { ++counter; @@ -971,7 +982,7 @@ struct TestMDRange_3D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -1207,7 +1218,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); @@ -1215,7 +1230,7 @@ struct TestMDRange_4D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) ); } // Test with reducers - scalar @@ -1415,7 +1430,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D 
functor( N0, N1, N2, N3 ); parallel_for( range, functor ); @@ -1424,10 +1443,10 @@ struct TestMDRange_4D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) { if ( h_view( i, j, k, l ) != 3 ) { ++counter; @@ -1435,7 +1454,7 @@ struct TestMDRange_4D { } if ( counter != 0 ) { - printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); + printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); } ASSERT_EQ( counter, 0 ); @@ -1682,7 +1701,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); @@ -1690,7 +1714,7 @@ struct TestMDRange_5D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) ); } // Test with reducers - scalar @@ -1810,7 +1834,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, 
s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); parallel_for( range, functor ); @@ -1819,11 +1848,11 @@ struct TestMDRange_5D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) { if ( h_view( i, j, k, l, m ) != 3 ) { ++counter; @@ -1831,7 +1860,7 @@ struct TestMDRange_5D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -2084,7 +2113,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); @@ -2092,7 +2127,7 @@ struct TestMDRange_6D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) ); } // Test with reducers - scalar @@ -2214,7 +2249,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, 
point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); parallel_for( range, functor ); @@ -2223,12 +2264,12 @@ struct TestMDRange_6D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) - for ( int n = 0; n < N5; ++n ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) + for ( int n = s5; n < N5; ++n ) { if ( h_view( i, j, k, l, m, n ) != 3 ) { ++counter; @@ -2236,7 +2277,7 @@ struct TestMDRange_6D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); diff --git a/lib/latte/Install.py b/lib/latte/Install.py index b3e771e4cc..37cb5d6b17 100644 --- a/lib/latte/Install.py +++ b/lib/latte/Install.py @@ -159,13 +159,13 @@ if buildflag or pathflag: os.remove("includelink") if os.path.isfile("liblink") or os.path.islink("liblink"): os.remove("liblink") - if os.path.isfile("filelink") or os.path.islink("filelink"): - os.remove("filelink") + if os.path.isfile("filelink.o") or os.path.islink("filelink.o"): + os.remove("filelink.o") cmd = 'ln -s "%s/src" includelink' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) cmd = 'ln -s "%s" liblink' % 
lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir + cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) # copy Makefile.lammps.suffix to Makefile.lammps diff --git a/lib/latte/Makefile.lammps.gfortran b/lib/latte/Makefile.lammps.gfortran index 921721552b..6aa7782f8a 100644 --- a/lib/latte/Makefile.lammps.gfortran +++ b/lib/latte/Makefile.lammps.gfortran @@ -3,5 +3,5 @@ # GNU Fortran settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas +latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas latte_SYSPATH = -fopenmp diff --git a/lib/latte/Makefile.lammps.ifort b/lib/latte/Makefile.lammps.ifort index 23d2b32fcc..0491bdd8a5 100644 --- a/lib/latte/Makefile.lammps.ifort +++ b/lib/latte/Makefile.lammps.ifort @@ -3,7 +3,7 @@ # Intel ifort settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink \ +latte_SYSLIB = ../../lib/latte/filelink.o \ -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \ -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \ -openmp -O0 diff --git a/src/.gitignore b/src/.gitignore index 1571065b72..13518abbe8 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -405,6 +405,8 @@ /fix_lambdah_calc.h /fix_langevin_eff.cpp /fix_langevin_eff.h +/fix_latte.cpp +/fix_latte.h /fix_lb_fluid.cpp /fix_lb_fluid.h /fix_lb_momentum.cpp diff --git a/src/Depend.sh b/src/Depend.sh index 9463607960..e1c812ebc2 100644 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then depend KOKKOS fi +if (test $1 = "USER-DRUDE") then + depend USER-OMP +fi + if (test $1 = "USER-FEP") then depend USER-OMP fi diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp index b63dc5fb8c..6c610c8c11 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp +++ 
b/src/KOKKOS/atom_vec_atomic_kokkos.cpp @@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecAtomicKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - 
const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* 
---------------------------------------------------------------------- */ - -template -struct AtomVecAtomicKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf 
f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecAtomicKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - 
AtomVecAtomicKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - 
buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_reverse(int n, int first, 
double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) { - sync(Host,F_MASK); - modified(Host,F_MASK); - } - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecAtomicKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h index 5e9a72c2e3..e4d2654e2c 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.h +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { virtual ~AtomVecAtomicKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); void unpack_border(int, int, double *); @@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, 
const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp index e0f29a27bb..076144420c 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.cpp +++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp @@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecBondKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } 
else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - 
domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd 
+ _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct 
AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecBondKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - 
} - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- 
*/ - -void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecBondKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h index 3dcc99fa78..7ec15450ef 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.h +++ b/src/KOKKOS/atom_vec_bond_kokkos.h @@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { virtual ~AtomVecBondKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecBondKokkos : public 
AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_tagint_1d d_molecule; DAT::t_int_2d d_nspecial; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp index 89f7e91c2b..7b8b74b405 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.cpp +++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp @@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm { /* ---------------------------------------------------------------------- */ -int AtomVecChargeKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } 
- } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecChargeKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT 
&yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - 
domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecChargeKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct 
AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } 
- } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - 
template struct AtomVecChargeKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h index f9b385e7ed..e9ff70bbe1 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.h +++ b/src/KOKKOS/atom_vec_charge_kokkos.h @@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { virtual ~AtomVecChargeKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h index 372404cc7d..cec1b82357 100644 --- a/src/KOKKOS/atom_vec_dpd_kokkos.h +++ b/src/KOKKOS/atom_vec_dpd_kokkos.h @@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public 
AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp index fd7eaf7c81..8e9abe4067 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.cpp +++ b/src/KOKKOS/atom_vec_full_kokkos.cpp @@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecFullKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0() - *buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + 
_pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - 
Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst), - _list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int 
& iswap, - const int nfirst, const int &pbc_flag, - const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* 
---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecFullKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* 
---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; 
i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecFullKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h index 760df087e1..33760a8b5f 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.h +++ b/src/KOKKOS/atom_vec_full_kokkos.h @@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { virtual ~AtomVecFullKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const 
DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; HAT::t_float_1d h_q; diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp index 5542991395..03fb2a4ead 100644 --- a/src/KOKKOS/atom_vec_kokkos.cpp +++ b/src/KOKKOS/atom_vec_kokkos.cpp @@ -12,6 +12,10 @@ ------------------------------------------------------------------------- */ #include "atom_vec_kokkos.h" +#include "atom_kokkos.h" +#include "comm_kokkos.h" +#include "domain.h" +#include "atom_masks.h" using namespace LAMMPS_NS; @@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp) buffer_size = 0; } +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_xfloat_2d_um _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_list(list.view()),_iswap(iswap), + 
_xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + _buf(i,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, + const int* const pbc) +{ + // Check whether to always run forward communication on the host + // Choose correct forward PackComm kernel + + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + 
domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + + return n*size_forward; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackCommSelf { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_x_array _xw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackCommSelf( + const typename DAT::tdual_x_array &x, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() 
(const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _xw(i+_nfirst,0) = _x(j,0); + _xw(i+_nfirst,1) = _x(j,1); + _xw(i+_nfirst,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst, const int &pbc_flag, const int* const pbc) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + 
Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnpackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array _x; + typename ArrayTypes::t_xfloat_2d_const _buf; + int _first; + + AtomVecKokkos_UnpackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const int& first):_x(x.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf ) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; 
+ + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + if (atom->mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + buf[m++] = h_v(j,1) + dvy; + buf[m++] = h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + 
buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_ffloat_2d _buf; + int _first; + + AtomVecKokkos_PackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const int& first):_f(f.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _buf(i,0) = _f(i+_first,0); + _buf(i,1) = _f(i+_first,1); + _buf(i,2) = _f(i+_first,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first, + const DAT::tdual_ffloat_2d &buf ) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } + + return n*size_reverse; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverseSelf { + typedef 
DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_f_array _fw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverseSelf( + const typename DAT::tdual_f_array &f, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_fw(f.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap) { + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + _fw(j,0) += _f(i+_nfirst,0); + _fw(j,1) += _f(i+_nfirst,1); + _fw(j,2) += _f(i+_nfirst,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array _f; + typename ArrayTypes::t_ffloat_2d_const _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_list(list.view()),_iswap(iswap) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const 
int j = _list(_iswap,i); + _f(j,0) += _buf(i,0); + _f(j,1) += _buf(i,1); + _f(j,2) += _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_ffloat_2d &buf) +{ + // Check whether to always run reverse communication on the host + // Choose correct reverse UnPackReverse kernel + + if(commKK->reverse_comm_on_host) { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse(int n, int first, double *buf) +{ + if(n > 0) + sync(Host,F_MASK); + + int m = 0; + const int last = first + n; + for (int i = first; i < last; i++) { + buf[m++] = h_f(i,0); + buf[m++] = h_f(i,1); + buf[m++] = h_f(i,2); + } + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf) +{ + int m = 0; + for (int i = 0; i < n; i++) { + const int j = list[i]; + h_f(j,0) += buf[m++]; + h_f(j,1) += buf[m++]; + h_f(j,2) += buf[m++]; + } + + if(n > 0) + modified(Host,F_MASK); +} diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h index 7f593f235f..20a07ec443 100644 --- a/src/KOKKOS/atom_vec_kokkos.h +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec { public: AtomVecKokkos(class LAMMPS *); virtual ~AtomVecKokkos() {} + virtual int pack_comm(int, int *, double *, int, int *); + virtual int pack_comm_vel(int, int *, double *, int, int *); + virtual void unpack_comm(int, int, double *); + virtual void unpack_comm_vel(int, int, double *); + virtual int 
pack_reverse(int, int, double *); + virtual void unpack_reverse(int, int *, double *); virtual void sync(ExecutionSpace space, unsigned int mask) = 0; virtual void modified(ExecutionSpace space, unsigned int mask) = 0; - virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {}; + virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0; virtual int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf) = 0; + const DAT::tdual_xfloat_2d &buf); + + virtual int + unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst); + + virtual int + pack_reverse_kokkos(const int &n, const int &nfirst, + const DAT::tdual_ffloat_2d &buf); + + virtual void + unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const DAT::tdual_ffloat_2d &buf); + virtual int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) = 0; - //{return 0;}; + virtual void unpack_border_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf, @@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec { DAT::tdual_int_1d k_sendlist, DAT::tdual_int_1d k_copylist, ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0; - //{return 0;}; + virtual int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space) = 0; - //{return 0;}; + protected: + 
HAT::t_x_array h_x; + HAT::t_v_array h_v; + HAT::t_f_array h_f; + class CommKokkos *commKK; size_t buffer_size; void* buffer; diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index f5ed0f525f..5534341342 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]); memory->sfree(sendlist); sendlist = NULL; - k_sendlist = ArrayTypes::tdual_int_2d(); + k_sendlist = DAT::tdual_int_2d(); + k_total_send = DAT::tdual_int_scalar("comm::k_total_send"); // error check for disallow of OpenMP threads? @@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) memory->destroy(buf_recv); buf_recv = NULL; - k_exchange_sendlist = ArrayTypes:: + k_exchange_sendlist = DAT:: tdual_int_1d("comm:k_exchange_sendlist",100); - k_exchange_copylist = ArrayTypes:: + k_exchange_copylist = DAT:: tdual_int_1d("comm:k_exchange_copylist",100); - k_count = ArrayTypes::tdual_int_1d("comm:k_count",1); - k_sendflag = ArrayTypes::tdual_int_1d("comm:k_sendflag",100); + k_count = DAT::tdual_int_scalar("comm:k_count"); + k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100); memory->destroy(maxsendlist); maxsendlist = NULL; @@ -102,8 +103,10 @@ void CommKokkos::init() atomKK = (AtomKokkos *) atom; exchange_comm_classic = lmp->kokkos->exchange_comm_classic; forward_comm_classic = lmp->kokkos->forward_comm_classic; + reverse_comm_classic = lmp->kokkos->reverse_comm_classic; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; forward_comm_on_host = lmp->kokkos->forward_comm_on_host; + reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host; CommBrick::init(); @@ -132,8 +135,11 @@ void CommKokkos::init() if (force->newton == 0) check_reverse = 0; if (force->pair) check_reverse += force->pair->comm_reverse_off; - if(check_reverse || check_forward) + if (ghost_velocity) forward_comm_classic = true; + + if 
(!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet + reverse_comm_classic = true; } /* ---------------------------------------------------------------------- @@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy) int n; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; - double **x = atom->x; double *buf; // exchange data with another proc @@ -181,32 +186,29 @@ void CommKokkos::forward_comm_device(int dummy) // if comm_x_only set, exchange or copy directly to x, don't unpack k_sendlist.sync(); + atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); for (int iswap = 0; iswap < nswap; iswap++) { - if (sendproc[iswap] != me) { if (comm_x_only) { - atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; - if (size_forward_recv[iswap]) { buf = atomKK->k_x.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_x.view().dimension_1(); MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + recvproc[iswap],0,world,&request); } n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]); - if (n) { MPI_Send(k_buf_send.view().ptr_on_device(), n,MPI_DOUBLE,sendproc[iswap],0,world); } - if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); - atomKK->modified(ExecutionSpaceFromDevice:: - space,X_MASK); + if (size_forward_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,X_MASK); + } } else if (ghost_velocity) { error->all(FLERR,"Ghost velocity forward comm not yet " "implemented with Kokkos"); @@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy) } } } + +/* ---------------------------------------------------------------------- + reverse communication of forces on atoms every timestep + other per-atom attributes may also be sent via pack/unpack routines 
+------------------------------------------------------------------------- */ + void CommKokkos::reverse_comm() { + if (!reverse_comm_classic) { + if (reverse_comm_on_host) reverse_comm_device(); + else reverse_comm_device(); + return; + } + k_sendlist.sync(); + if (comm_f_only) atomKK->sync(Host,F_MASK); else atomKK->sync(Host,ALL_MASK); + CommBrick::reverse_comm(); + if (comm_f_only) atomKK->modified(Host,F_MASK); else atomKK->modified(Host,ALL_MASK); - atomKK->sync(Device,ALL_MASK); + + //atomKK->sync(Device,ALL_MASK); // is this needed? } +template +void CommKokkos::reverse_comm_device() +{ + int n; + MPI_Request request; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + double *buf; + + // exchange data with another proc + // if other proc is self, just copy + // if comm_f_only set, exchange or copy directly from f, don't pack + + k_sendlist.sync(); + atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); + + for (int iswap = nswap-1; iswap >= 0; iswap--) { + if (sendproc[iswap] != me) { + if (comm_f_only) { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + if (size_reverse_send[iswap]) { + buf = atomKK->k_f.view().ptr_on_device() + + firstrecv[iswap]*atomKK->k_f.view().dimension_1(); + + MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, + recvproc[iswap],0,world); + } + if (size_reverse_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,F_MASK); + } + } else { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(), + size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send); + if (n) + MPI_Send(k_buf_send.view().ptr_on_device(),n, + MPI_DOUBLE,recvproc[iswap],0,world); + if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + 
avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap, + k_buf_recv); + } else { + if (sendnum[iswap]) + n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap, + firstrecv[iswap]); + } + } +} + +/* ---------------------------------------------------------------------- */ + void CommKokkos::forward_comm_fix(Fix *fix, int size) { k_sendlist.sync(); @@ -408,7 +482,7 @@ struct BuildExchangeListFunctor { typename AT::t_x_array _x; int _nlocal,_dim; - typename AT::t_int_1d _nsend; + typename AT::t_int_scalar _nsend; typename AT::t_int_1d _sendlist; typename AT::t_int_1d _sendflag; @@ -416,7 +490,7 @@ struct BuildExchangeListFunctor { BuildExchangeListFunctor( const typename AT::tdual_x_array x, const typename AT::tdual_int_1d sendlist, - typename AT::tdual_int_1d nsend, + typename AT::tdual_int_scalar nsend, typename AT::tdual_int_1d sendflag,int nlocal, int dim, X_FLOAT lo, X_FLOAT hi): _x(x.template view()), @@ -430,7 +504,7 @@ struct BuildExchangeListFunctor { KOKKOS_INLINE_FUNCTION void operator() (int i) const { if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { - const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1); + const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1); if(mysend<_sendlist.dimension_0()) { _sendlist(mysend) = i; _sendflag(i) = 1; @@ -489,9 +563,9 @@ void CommKokkos::exchange_device() if (true) { if (k_sendflag.h_view.dimension_0()(); - k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0(); - while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_count.h_view(0) = 0; + k_count.h_view() = k_exchange_sendlist.h_view.dimension_0(); + while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_count.h_view() = 0; k_count.modify(); k_count.sync(); @@ -504,10 +578,10 @@ void CommKokkos::exchange_device() k_count.modify(); k_count.sync(); - if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_exchange_sendlist.resize(k_count.h_view(0)*1.1); - 
k_exchange_copylist.resize(k_count.h_view(0)*1.1); - k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0(); + if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_exchange_sendlist.resize(k_count.h_view()*1.1); + k_exchange_copylist.resize(k_count.h_view()*1.1); + k_count.h_view()=k_exchange_sendlist.h_view.dimension_0(); } } k_exchange_copylist.sync(); @@ -515,22 +589,22 @@ void CommKokkos::exchange_device() k_sendflag.sync(); int sendpos = nlocal-1; - nlocal -= k_count.h_view(0); - for(int i = 0; i < k_count.h_view(0); i++) { + nlocal -= k_count.h_view(); + for(int i = 0; i < k_count.h_view(); i++) { if (k_exchange_sendlist.h_view(i)(); k_exchange_copylist.sync(); - nsend = k_count.h_view(0); + nsend = k_count.h_view(); if (nsend > maxsend) grow_send_kokkos(nsend,1); nsend = - avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send, + avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send, k_exchange_sendlist,k_exchange_copylist, ExecutionSpaceFromDevice:: space,dim,lo,hi); @@ -640,9 +714,7 @@ void CommKokkos::borders() } atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); k_sendlist.sync(); - k_sendlist.modify(); CommBrick::borders(); k_sendlist.modify(); atomKK->modified(Host,ALL_MASK); @@ -659,11 +731,11 @@ struct BuildBorderListFunctor { int iswap,maxsendlist; int nfirst,nlast,dim; typename AT::t_int_2d sendlist; - typename AT::t_int_1d nsend; + typename AT::t_int_scalar nsend; BuildBorderListFunctor(typename AT::tdual_x_array _x, typename AT::tdual_int_2d _sendlist, - typename AT::tdual_int_1d _nsend,int _nfirst, + typename AT::tdual_int_scalar _nsend,int _nfirst, int _nlast, int _dim, X_FLOAT _lo, X_FLOAT _hi, int _iswap, int _maxsendlist): @@ -684,7 +756,7 @@ struct BuildBorderListFunctor { for (int i=teamstart + dev.team_rank(); i= lo && x(i,dim) <= hi) mysend++; } - const int my_store_pos = dev.team_scan(mysend,&nsend(0)); + const int my_store_pos = dev.team_scan(mysend,&nsend()); if (my_store_pos+mysend < maxsendlist) { 
mysend = my_store_pos; @@ -713,7 +785,7 @@ void CommKokkos::borders_device() { AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; ExecutionSpace exec_space = ExecutionSpaceFromDevice::space; - k_sendlist.modify(); + k_sendlist.sync(); atomKK->sync(exec_space,ALL_MASK); // do swaps over all 3 dimensions @@ -763,37 +835,38 @@ void CommKokkos::borders_device() { if (sendflag) { if (!bordergroup || ineed >= 2) { if (style == SINGLE) { - typename ArrayTypes::tdual_int_1d total_send("TS",1); - total_send.h_view(0) = 0; - if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); - } + k_total_send.h_view() = 0; + k_total_send.template modify(); + k_total_send.template sync(); BuildBorderListFunctor f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + k_total_send.template modify(); + k_total_send.template sync(); + + k_sendlist.modify(); + + if(k_total_send.h_view() >= maxsendlist[iswap]) { + grow_list(iswap,k_total_send.h_view()); + + k_total_send.h_view() = 0; + k_total_send.template modify(); + k_total_send.template sync(); - if(total_send.h_view(0) >= maxsendlist[iswap]) { - grow_list(iswap,total_send.h_view(0)); - k_sendlist.modify(); - total_send.h_view(0) = 0; - if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); - } BuildBorderListFunctor f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + + k_total_send.template modify(); + k_total_send.template sync(); + + k_sendlist.modify(); } - nsend = 
total_send.h_view(0); + nsend = k_total_send.h_view(); } else { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); @@ -916,10 +989,11 @@ void CommKokkos::borders_device() { // reset global->local map - if (exec_space == Host) k_sendlist.sync(); atomKK->modified(exec_space,ALL_MASK); - atomKK->sync(Host,TAG_MASK); - if (map_style) atom->map_set(); + if (map_style) { + atomKK->sync(Host,TAG_MASK); + atom->map_set(); + } } /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR and bufextra @@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) buf_send = k_buf_send.view().ptr_on_device(); } else { - k_buf_send = ArrayTypes:: + k_buf_send = DAT:: tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); buf_send = k_buf_send.view().ptr_on_device(); } @@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) { maxrecv = static_cast (BUFFACTOR * n); int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; - k_buf_recv = ArrayTypes:: + k_buf_recv = DAT:: tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); buf_recv = k_buf_recv.view().ptr_on_device(); } @@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n) { int size = static_cast (BUFFACTOR * n); + if (exchange_comm_classic) { // force realloc on Host + k_sendlist.sync(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); for(int i=0;i(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); memory->grow(maxsendlist,n,"comm:maxsendlist"); diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index a8ae973124..f137655cb8 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -25,15 +25,17 @@ class CommKokkos : public CommBrick { bool exchange_comm_classic; bool 
forward_comm_classic; + bool reverse_comm_classic; bool exchange_comm_on_host; bool forward_comm_on_host; + bool reverse_comm_on_host; CommKokkos(class LAMMPS *); ~CommKokkos(); void init(); void forward_comm(int dummy = 0); // forward comm of atom coords - void reverse_comm(); // reverse comm of atom coords + void reverse_comm(); // reverse comm of atom coords void exchange(); // move atoms to new procs void borders(); // setup list of atoms to comm @@ -47,15 +49,17 @@ class CommKokkos : public CommBrick { void reverse_comm_dump(class Dump *); // reverse comm from a Dump template void forward_comm_device(int dummy); + template void reverse_comm_device(); template void forward_comm_pair_device(Pair *pair); template void exchange_device(); template void borders_device(); protected: DAT::tdual_int_2d k_sendlist; + DAT::tdual_int_scalar k_total_send; DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag; - DAT::tdual_int_1d k_count; + DAT::tdual_int_scalar k_count; //double *buf_send; // send buffer for all comm //double *buf_recv; // recv buffer for all comm diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index e54b53ae89..5d2f6a0438 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) : nmax = nmax = m_cap = 0; allocated_flag = 0; + nprev = 4; } /* ---------------------------------------------------------------------- */ @@ -158,15 +159,15 @@ void FixQEqReaxKokkos::init_hist() { int i,j; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = 
k_t_hist.h_view; for( i = 0; i < atom->nmax; i++ ) - for( j = 0; j < 5; j++ ) + for( j = 0; j < nprev; j++ ) k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0; k_s_hist.template modify(); @@ -334,11 +335,11 @@ void FixQEqReaxKokkos::allocate_array() d_d = k_d.template view(); h_d = k_d.h_view; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = k_t_hist.h_view; } @@ -368,7 +369,7 @@ void FixQEqReaxKokkos::zero_item(int ii) const d_o[i] = 0.0; d_r[i] = 0.0; d_d[i] = 0.0; - //for( int j = 0; j < 5; j++ ) + //for( int j = 0; j < nprev; j++ ) //d_s_hist(i,j) = d_t_hist(i,j) = 0.0; } @@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos::calculate_q_item(int ii) const if (mask[i] & groupbit) { q(i) = d_s[i] - delta * d_t[i]; - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { d_s_hist(i,k) = d_s_hist(i,k-1); d_t_hist(i,k) = d_t_hist(i,k-1); } @@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos::memory_usage() { double bytes; - bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist + bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage bytes += n_cap*2 * sizeof(int); // matrix... 
bytes += m_cap * sizeof(int); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 072a802b54..2b02624dce 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) neighflag_qeq_set = 0; exchange_comm_classic = 0; forward_comm_classic = 0; + reverse_comm_classic = 0; exchange_comm_on_host = 0; forward_comm_on_host = 0; + reverse_comm_on_host = 0; #ifdef KILL_KOKKOS_ON_SIGSEGV signal(SIGSEGV, my_signal_handler); @@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg) neighflag_qeq_set = 0; int newtonflag = 0; double binsize = 0.0; - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; int iarg = 0; while (iarg < narg) { @@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg) } else if (strcmp(arg[iarg],"comm") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { - exchange_comm_classic = forward_comm_classic = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; } else if (strcmp(arg[iarg+1],"host") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; } else if 
(strcmp(arg[iarg],"comm/exchange") == 0) { @@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; + } else if (strcmp(arg[iarg],"comm/reverse") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 0; + } else error->all(FLERR,"Illegal package kokkos command"); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 8e28b38cbf..7b7848f1f0 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers { int neighflag_qeq_set; int exchange_comm_classic; int forward_comm_classic; + int reverse_comm_classic; int exchange_comm_on_host; int forward_comm_on_host; + int reverse_comm_on_host; int num_threads,ngpu; int numa; int auto_sync; diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp index c7e815928a..95ea105ad9 100644 --- a/src/KOKKOS/nbin_kokkos.cpp +++ b/src/KOKKOS/nbin_kokkos.cpp @@ -75,6 +75,10 @@ void NBinKokkos::bin_atoms_setup(int nall) k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins); bincount = k_bincount.view(); } + if (nall > k_atom2bin.d_view.dimension_0()) { + k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall); + atom2bin = k_atom2bin.view(); + } } /* ---------------------------------------------------------------------- @@ -86,6 +90,10 @@ void NBinKokkos::bin_atoms() { last_bin = update->ntimestep; + k_bins.template sync(); + k_bincount.template sync(); + k_atom2bin.template sync(); + h_resize() = 1; while(h_resize() > 0) { @@ -115,6 +123,10 @@ void NBinKokkos::bin_atoms() c_bins = bins; 
} } + + k_bins.template modify(); + k_bincount.template modify(); + k_atom2bin.template modify(); } /* ---------------------------------------------------------------------- */ @@ -125,6 +137,7 @@ void NBinKokkos::binatomsItem(const int &i) const { const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2)); + atom2bin(i) = ibin; const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1); if(ac < bins.dimension_1()) { bins(ibin, ac) = i; diff --git a/src/KOKKOS/nbin_kokkos.h b/src/KOKKOS/nbin_kokkos.h index de3cf41d19..bf2ccc5908 100644 --- a/src/KOKKOS/nbin_kokkos.h +++ b/src/KOKKOS/nbin_kokkos.h @@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard { int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; typename AT::t_int_1d bincount; const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + typename AT::t_int_1d atom2bin; typename AT::t_int_scalar d_resize; typename ArrayTypes::t_int_scalar h_resize; typename AT::t_x_array_randomread x; diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp index 9a40808052..f34b149864 100644 --- a/src/KOKKOS/neighbor_kokkos.cpp +++ b/src/KOKKOS/neighbor_kokkos.cpp @@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag) // build pairwise lists for all perpetual NPair/NeighList // grow() with nlocal/nall args so that only realloc if have to - atomKK->sync(Host,ALL_MASK); for (i = 0; i < npair_perpetual; i++) { m = plist[i]; + if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK); if (!lists[m]->copy) lists[m]->grow(nlocal,nall); neigh_pair[m]->build_setup(); neigh_pair[m]->build(lists[m]); diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index b568bd5c93..d3cdcb0680 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -73,6 +73,7 @@ void NPairKokkos::copy_bin_info() atoms_per_bin = nbKK->atoms_per_bin; k_bincount = nbKK->k_bincount; k_bins = 
nbKK->k_bins; + k_atom2bin = nbKK->k_atom2bin; } /* ---------------------------------------------------------------------- @@ -88,13 +89,15 @@ void NPairKokkos::copy_stencil_info() int maxstencil = ns->get_maxstencil(); - k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); + if (maxstencil > k_stencil.dimension_0()) + k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); for (int k = 0; k < maxstencil; k++) k_stencil.h_view(k) = ns->stencil[k]; k_stencil.modify(); k_stencil.sync(); if (GHOST) { - k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); + if (maxstencil > k_stencilxyz.dimension_0()) + k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); for (int k = 0; k < maxstencil; k++) { k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0]; k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1]; @@ -122,6 +125,7 @@ void NPairKokkos::build(NeighList *list_) k_cutneighsq.view(), k_bincount.view(), k_bins.view(), + k_atom2bin.view(), nstencil, k_stencil.view(), k_stencilxyz.view(), @@ -164,8 +168,9 @@ void NPairKokkos::build(NeighList *list_) k_ex_mol_group.sync(); k_ex_mol_bit.sync(); k_ex_mol_intra.sync(); - k_bincount.sync(), - k_bins.sync(), + k_bincount.sync(); + k_bins.sync(); + k_atom2bin.sync(); atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK); data.special_flag[0] = special_flag[0]; @@ -317,7 +322,7 @@ void NeighborKokkosExecute:: const X_FLOAT ztmp = x(i, 2); const int itype = type(i); - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); const typename ArrayTypes::t_int_1d_const_um stencil = d_stencil; @@ -431,7 +436,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; @@ -641,7 +646,7 @@ void 
NeighborKokkosExecute::build_ItemCuda(typename Kokkos::TeamPoli if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } } } @@ -678,7 +683,7 @@ void NeighborKokkosExecute:: // no molecular test when i = ghost atom if (i < nlocal) { - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); for (int k = 0; k < nstencil; k++) { const int jbin = ibin + stencil[k]; for(int m = 0; m < c_bincount(jbin); m++) { @@ -764,7 +769,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; } diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index 517ea546fa..6c1c0e958b 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -105,6 +105,7 @@ class NPairKokkos : public NPair { int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; // data from NStencil class @@ -148,6 +149,8 @@ class NeighborKokkosExecute const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + const typename AT::t_int_1d atom2bin; + const typename AT::t_int_1d_const c_atom2bin; // data from NStencil class @@ -190,6 +193,7 @@ class NeighborKokkosExecute const typename AT::t_xfloat_2d_randomread &_cutneighsq, const typename AT::t_int_1d &_bincount, const typename AT::t_int_2d &_bins, + const typename AT::t_int_1d &_atom2bin, const int _nstencil, const typename AT::t_int_1d &_d_stencil, const typename AT::t_int_1d_3 &_d_stencilxyz, @@ -224,6 +228,7 @@ class NeighborKokkosExecute const int & _xprd_half, const int & _yprd_half, const int & _zprd_half): neigh_list(_neigh_list), 
cutneighsq(_cutneighsq), bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), + atom2bin(_atom2bin),c_atom2bin(_atom2bin), nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz), nlocal(_nlocal), x(_x),type(_type),mask(_mask),molecule(_molecule), @@ -281,38 +286,6 @@ class NeighborKokkosExecute void build_ItemCuda(typename Kokkos::TeamPolicy::member_type dev) const; #endif - KOKKOS_INLINE_FUNCTION - int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const - { - int ix,iy,iz; - - if (x >= bboxhi[0]) - ix = static_cast ((x-bboxhi[0])*bininvx) + nbinx; - else if (x >= bboxlo[0]) { - ix = static_cast ((x-bboxlo[0])*bininvx); - ix = MIN(ix,nbinx-1); - } else - ix = static_cast ((x-bboxlo[0])*bininvx) - 1; - - if (y >= bboxhi[1]) - iy = static_cast ((y-bboxhi[1])*bininvy) + nbiny; - else if (y >= bboxlo[1]) { - iy = static_cast ((y-bboxlo[1])*bininvy); - iy = MIN(iy,nbiny-1); - } else - iy = static_cast ((y-bboxlo[1])*bininvy) - 1; - - if (z >= bboxhi[2]) - iz = static_cast ((z-bboxhi[2])*bininvz) + nbinz; - else if (z >= bboxlo[2]) { - iz = static_cast ((z-bboxlo[2])*bininvz); - iz = MIN(iz,nbinz-1); - } else - iz = static_cast ((z-bboxlo[2])*bininvz) - 1; - - return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); - } - KOKKOS_INLINE_FUNCTION int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const { diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp index d95cd8f8ae..d5f83f4537 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.cpp +++ b/src/KOKKOS/pair_reaxc_kokkos.cpp @@ -131,6 +131,8 @@ template void PairReaxCKokkos::init_style() { PairReaxC::init_style(); + if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version + fix_reax = NULL; // irequest = neigh request made by parent class @@ -555,8 +557,8 @@ void PairReaxCKokkos::Deallocate_Lookup_Tables() ntypes = atom->ntypes; - for( i = 0; i < ntypes; ++i ) { - for( j = i; j < ntypes; ++j ) 
+ for( i = 0; i <= ntypes; ++i ) { + for( j = i; j <= ntypes; ++j ) if( LR[i][j].n ) { sfree( LR[i][j].y, "LR[i,j].y" ); sfree( LR[i][j].H, "LR[i,j].H" ); diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index e4a3f857d3..adec5ff1bd 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -294,6 +294,7 @@ void VerletKokkos::run(int n) int n_pre_exchange = modify->n_pre_exchange; int n_pre_neighbor = modify->n_pre_neighbor; int n_pre_force = modify->n_pre_force; + int n_pre_reverse = modify->n_pre_reverse; int n_post_force = modify->n_post_force; int n_end_of_step = modify->n_end_of_step; @@ -304,9 +305,9 @@ void VerletKokkos::run(int n) f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0()); - static double time = 0.0; atomKK->sync(Device,ALL_MASK); - Kokkos::Impl::Timer ktimer; + //static double time = 0.0; + //Kokkos::Impl::Timer ktimer; timer->init_timeout(); for (int i = 0; i < n; i++) { @@ -320,10 +321,10 @@ void VerletKokkos::run(int n) // initial time integration - ktimer.reset(); + //ktimer.reset(); timer->stamp(); modify->initial_integrate(vflag); - time += ktimer.seconds(); + //time += ktimer.seconds(); if (n_post_integrate) modify->post_integrate(); timer->stamp(Timer::MODIFY); @@ -523,11 +524,18 @@ void VerletKokkos::run(int n) atomKK->k_f.modify(); } + if (n_pre_reverse) { + modify->pre_reverse(eflag,vflag); + timer->stamp(Timer::MODIFY); + } // reverse communication of forces - if (force->newton) comm->reverse_comm(); - timer->stamp(Timer::COMM); + if (force->newton) { + Kokkos::fence(); + comm->reverse_comm(); + timer->stamp(Timer::COMM); + } // force modifications, final time integration, diagnostics diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 68f879860a..e4dc74d79b 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -15,7 +15,7 @@ DEPFLAGS = -M LINK = mpicxx 
-cxx=icc LINKFLAGS = -qopenmp $(OPTFLAGS) -LIB = +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp index 30ebc779c5..14eeac8d66 100644 --- a/src/REPLICA/prd.cpp +++ b/src/REPLICA/prd.cpp @@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg) time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0; bigint clock = 0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp index 5a4d885224..347cd3ba67 100644 --- a/src/REPLICA/tad.cpp +++ b/src/REPLICA/tad.cpp @@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg) nbuild = ndanger = 0; time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp index db8c589afb..a9ee35bbfc 100644 --- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp +++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp @@ -134,7 +134,7 @@ public: // Manifold itself: manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp, int narg, char **arg) - : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} + : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} manifold_gaussian_bump::~manifold_gaussian_bump() @@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut() n( x, nn ); double taper_z; if( xx <= rc1 ){ - taper_z = gaussian_bump(xx); + taper_z = gaussian_bump(xx); }else if( xx < rc2 ){ - taper_z = lut_get_z( xx ); + taper_z = lut_get_z( xx ); }else{ - taper_z = 0.0; + taper_z = 0.0; } - fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, + fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, gg, nn[0], nn[1], nn[2] ); } fclose(fp); diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp index f3dec42a83..e1e5f579b8 100644 --- a/src/USER-MISC/fix_srp.cpp +++ 
b/src/USER-MISC/fix_srp.cpp @@ -98,7 +98,7 @@ int FixSRP::setmask() void FixSRP::init() { - if (force->pair_match("hybrid",1) == NULL) + if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL) error->all(FLERR,"Cannot use pair srp without pair_style hybrid"); int has_rigid = 0; diff --git a/src/USER-OMP/fix_qeq_reax_omp.cpp b/src/USER-OMP/fix_qeq_reax_omp.cpp index 4457ab6592..d89c9627fe 100644 --- a/src/USER-OMP/fix_qeq_reax_omp.cpp +++ b/src/USER-OMP/fix_qeq_reax_omp.cpp @@ -703,7 +703,7 @@ void FixQEqReaxOMP::calculate_Q() q[i] = s[i] - u * t[i]; // backup s & t - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp index 9d165f3fd3..d1c4f90771 100644 --- a/src/USER-REAXC/fix_qeq_reax.cpp +++ b/src/USER-REAXC/fix_qeq_reax.cpp @@ -95,7 +95,7 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) : pack_flag = 0; s = NULL; t = NULL; - nprev = 5; + nprev = 4; Hdia_inv = NULL; b_s = NULL; @@ -817,7 +817,7 @@ void FixQEqReax::calculate_Q() q[i] = s[i] - u * t[i]; /* backup s & t */ - for (k = 4; k > 0; --k) { + for (k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp index 3c972b8244..06227b7a84 100644 --- a/src/comm_brick.cpp +++ b/src/comm_brick.cpp @@ -476,8 +476,7 @@ void CommBrick::forward_comm(int dummy) if (sendproc[iswap] != me) { if (comm_x_only) { if (size_forward_recv[iswap]) { - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; + buf = x[firstrecv[iswap]]; MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, recvproc[iswap],0,world,&request); } @@ -547,8 +546,7 @@ void CommBrick::reverse_comm() MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) { - if 
(size_reverse_send[iswap]) buf = f[firstrecv[iswap]]; - else buf = NULL; + buf = f[firstrecv[iswap]]; MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, recvproc[iswap],0,world); } diff --git a/src/dump.cpp b/src/dump.cpp index 44098298ba..ddd958c25c 100644 --- a/src/dump.cpp +++ b/src/dump.cpp @@ -238,7 +238,7 @@ void Dump::init() int gcmcflag = 0; for (int i = 0; i < modify->nfix; i++) if ((strcmp(modify->fix[i]->style,"gcmc") == 0)) - gcmcflag = 1; + gcmcflag = 1; if (sortcol == 0 && atom->tag_consecutive() && !gcmcflag) { tagint *tag = atom->tag; @@ -898,7 +898,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"fileper") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify fileper " + error->all(FLERR,"Cannot use dump_modify fileper " "without % in dump file name"); int nper = force->inumeric(FLERR,arg[iarg+1]); if (nper <= 0) error->all(FLERR,"Illegal dump_modify command"); @@ -973,7 +973,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"nfile") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify nfile " + error->all(FLERR,"Cannot use dump_modify nfile " "without % in dump file name"); int nfile = force->inumeric(FLERR,arg[iarg+1]); if (nfile <= 0) error->all(FLERR,"Illegal dump_modify command"); diff --git a/src/finish.cpp b/src/finish.cpp index 45e9226388..c22ecaae60 100644 --- a/src/finish.cpp +++ b/src/finish.cpp @@ -130,7 +130,7 @@ void Finish::end(int flag) atom->natoms); if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps, atom->natoms); - + // Gromacs/NAMD-style performance metric for suitable unit settings if ( timeflag && !minflag && !prdflag && !tadflag && @@ -144,7 +144,7 @@ void Finish::end(int flag) double one_fs = force->femtosecond; double t_step = ((double) time_loop) / ((double) update->nsteps); double 
step_t = 1.0/t_step; - + if (strcmp(update->unit_style,"lj") == 0) { double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs; const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n"; @@ -161,7 +161,7 @@ void Finish::end(int flag) } // CPU use on MPI tasks and OpenMP threads - + if (timeflag) { if (lmp->kokkos) { const char fmt2[] = diff --git a/src/input.cpp b/src/input.cpp index 7d11b8741b..23b89d3040 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -18,7 +18,7 @@ #include #include #include -#include "sys/stat.h" +#include #include "input.h" #include "style_command.h" #include "universe.h" diff --git a/src/main.cpp b/src/main.cpp index 7401183fea..82dac5af6d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,10 @@ #include #endif +#ifdef FFT_FFTW3 +#include +#endif + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- @@ -62,4 +66,10 @@ int main(int argc, char **argv) #endif MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); + +#ifdef FFT_FFTW3 + // tell fftw3 to delete its global memory pool + // and thus avoid bogus valgrind memory leak reports + fftw_cleanup(); +#endif } diff --git a/src/modify.cpp b/src/modify.cpp index 4516788aa9..361079bc16 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -110,7 +110,7 @@ Modify::~Modify() // delete all fixes // do it via delete_fix() so callbacks in Atom are also updated correctly - while (nfix) delete_fix(fix[0]->id); + while (nfix) delete_fix(0); memory->sfree(fix); memory->destroy(fmask); @@ -863,9 +863,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->restart(state_restart_global[i]); used_restart_global[i] = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting global fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting global fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -885,9 +885,9 
@@ void Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->unpack_restart(j,index_restart_peratom[i]); fix[ifix]->restart_reset = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting peratom fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting peratom fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -944,7 +944,13 @@ void Modify::delete_fix(const char *id) { int ifix = find_fix(id); if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete"); - delete fix[ifix]; + delete_fix(ifix); +} + +void Modify::delete_fix(int ifix) +{ + if(fix[ifix]) + delete fix[ifix]; atom->update_callback(ifix); // move other Fixes and fmask down in list one slot @@ -1409,24 +1415,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_global; i++) - if (used_restart_global[i] == 0) break; + if (used_restart_global[i] == 0) break; if (i == nfix_restart_global) { - if (screen) + if (screen) fprintf(screen,"All restart file global fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file global fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file global fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); - for (i = 0; i < nfix_restart_global; i++) { - if (used_restart_global[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - } + if (screen) fprintf(screen,"Unused restart file global fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); + for (i = 0; i < nfix_restart_global; i++) { + if (used_restart_global[i]) continue; + if (screen) 
fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + } } } @@ -1445,24 +1451,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_peratom; i++) - if (used_restart_peratom[i] == 0) break; + if (used_restart_peratom[i] == 0) break; if (i == nfix_restart_peratom) { - if (screen) + if (screen) fprintf(screen,"All restart file peratom fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file peratom fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); - for (i = 0; i < nfix_restart_peratom; i++) { - if (used_restart_peratom[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - } + if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); + for (i = 0; i < nfix_restart_peratom; i++) { + if (used_restart_peratom[i]) continue; + if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + } } } diff --git a/src/modify.h b/src/modify.h index d825d5c4ef..4ec61f6d57 100644 --- a/src/modify.h +++ b/src/modify.h @@ -95,6 +95,7 @@ class Modify : protected Pointers { void add_fix(int, char **, int trysuffix=1); void modify_fix(int, char **); void delete_fix(const char *); + void delete_fix(int); int find_fix(const char *); int find_fix_by_style(const char *); 
int check_package(const char *); diff --git a/tools/phonon/Makefile b/tools/phonon/Makefile index 0aacb1e086..67f9b91fdf 100644 --- a/tools/phonon/Makefile +++ b/tools/phonon/Makefile @@ -1,7 +1,7 @@ .SUFFIXES : .o .cpp # compiler and flags -CC = g++ -Wno-unused-result -LINK = $(CC) -static +CC = g++ -Wall +LINK = $(CC) CFLAGS = -O3 $(DEBUG) $(UFLAG) # OFLAGS = -O3 $(DEBUG) @@ -9,18 +9,17 @@ INC = $(LPKINC) $(TCINC) $(SPGINC) LIB = $(LPKLIB) $(TCLIB) $(SPGLIB) # # cLapack library needed -LPKINC = -I/opt/libs/clapack/3.2.1/include -LPKLIB = -L/opt/libs/clapack/3.2.1/lib -lclapack -lblas -lf2c #-lm +LPKINC = +LPKLIB =-llapack # -# Tricubic library needed -TCINC = -I/opt/libs/tricubic/1.0/include -TCLIB = -L/opt/libs/tricubic/1.0/lib -ltricubic # # spglib 1.8.2, used to get the irreducible q-points # if UFLAG is not set, spglib won't be used. -UFLAG = -DUseSPG -SPGINC = -I/opt/libs/spglib/1.8.2/include -SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + +# UFLAG = -DUseSPG +# SPGINC = -I/opt/libs/spglib/1.8.2/include +# SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + # if spglib other than version 1.8.2 is used, please # modify file phonon.cpp, instruction can be found by searching 1.8.2 @@ -36,7 +35,7 @@ SRC = $(wildcard *.cpp) OBJ = $(SRC:.cpp=.o) #==================================================================== -all: ver ${EXE} +all: ${EXE} ${EXE}: $(OBJ) $(LINK) $(OFLAGS) $(OBJ) $(LIB) -o $@ @@ -59,3 +58,16 @@ ver: $(CC) $(CFLAGS) -c $< .cpp.o: $(CC) $(CFLAGS) $(INC) -c $< + +#==================================================================== +# dependencies +disp.o: disp.cpp phonon.h dynmat.h memory.h interpolate.h green.h timer.h \ + global.h +dynmat.o: dynmat.cpp dynmat.h memory.h interpolate.h version.h global.h +green.o: green.cpp green.h memory.h global.h +interpolate.o: interpolate.cpp interpolate.h memory.h global.h +main.o: main.cpp dynmat.h memory.h interpolate.h phonon.h +memory.o: memory.cpp memory.h +phonon.o: phonon.cpp phonon.h dynmat.h 
memory.h interpolate.h green.h \ + timer.h global.h +timer.o: timer.cpp timer.h diff --git a/tools/phonon/README b/tools/phonon/README index ae6383b6bd..b54d96d8a3 100644 --- a/tools/phonon/README +++ b/tools/phonon/README @@ -5,15 +5,9 @@ analyse the phonon related information. #------------------------------------------------------------------------------- 1. Dependencies - The clapack library is needed to solve the eigen problems, - which could be downloaded from: - http://www.netlib.org/clapack/ - - The tricubic library is also needed to do tricubic interpolations, - which could be obtained from: - http://orca.princeton.edu/francois/software/tricubic/ - or - http://1drv.ms/1J2WFYk + The LAPACK library is needed to solve the eigen problems. + http://www.netlib.org/lapack/ + Intel MKL can be used as well. The spglib is optionally needed, enabling one to evaluate the phonon density of states or vibrational thermal properties diff --git a/tools/phonon/disp.cpp b/tools/phonon/disp.cpp index 2fa603916c..218e01e7fc 100644 --- a/tools/phonon/disp.cpp +++ b/tools/phonon/disp.cpp @@ -18,7 +18,8 @@ void Phonon::pdisp() { // ask the output file name and write the header. 
char str[MAXLINE]; - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); #ifdef UseSPG // ask method to generate q-lines int method = 2; @@ -53,7 +54,6 @@ void Phonon::pdisp() while (1){ for (int i = 0; i < 3; ++i) qstr[i] = qend[i]; - int quit = 0; printf("\nPlease input the start q-point in unit of B1->B3, q to exit [%g %g %g]: ", qstr[0], qstr[1], qstr[2]); int n = count_words(fgets(str, MAXLINE, stdin)); ptr = strtok(str, " \t\n\r\f"); @@ -2844,7 +2844,8 @@ void Phonon::pdisp() printf("\nPhonon dispersion data are written to: %s, you can visualize the results\n", fname); printf("by invoking: `gnuplot pdisp.gnuplot; gv pdisp.eps`\n"); } - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); delete []fname; nodes.clear(); diff --git a/tools/phonon/dynmat.cpp b/tools/phonon/dynmat.cpp index e82f473130..3b7bfe8268 100644 --- a/tools/phonon/dynmat.cpp +++ b/tools/phonon/dynmat.cpp @@ -3,6 +3,11 @@ #include "version.h" #include "global.h" +extern "C" void zheevd_(char *, char *, long int *, doublecomplex *, + long int *, double *, doublecomplex *, + long int *, double *, long int *, long int *, + long int *, long int *); + // to initialize the class DynMat::DynMat(int narg, char **arg) { @@ -81,7 +86,8 @@ DynMat::DynMat(int narg, char **arg) printf("Number of atoms per unit cell : %d\n", nucell); printf("System dimension : %d\n", sysdim); printf("Boltzmann constant in used units : %g\n", boltz); - for (int i = 0; i < 80; ++i) printf("="); printf("\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n"); if (sysdim < 1||sysdim > 3||nx < 1||ny < 1||nz < 1||nucell < 1){ printf("Wrong values read from header of file: %s, please check the binary file!\n", binfile); fclose(fp); exit(3); @@ -117,11 +123,11 @@ DynMat::DynMat(int narg, char **arg) memory->create(attyp, nucell, "DynMat:attyp"); memory->create(M_inv_sqrt, nucell, 
"DynMat:M_inv_sqrt"); - if ( fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} fclose(fp); car2dir(); @@ -229,9 +235,9 @@ return; int DynMat::geteigen(double *egv, int flag) { char jobz, uplo; - integer n, lda, lwork, lrwork, *iwork, liwork, info; + long int n, lda, lwork, lrwork, *iwork, liwork, info; doublecomplex *work; - doublereal *w = &egv[0], *rwork; + double *w = &egv[0], *rwork; n = fftdim; if (flag) jobz = 'V'; @@ -338,7 +344,8 @@ void DynMat::EnforceASR() char *ptr = strtok(str," 
\t\n\r\f"); if (ptr) nasr = atoi(ptr); if (nasr < 1){ - for (int i=0; i<80; i++) printf("="); printf("\n"); + for (int i=0; i<80; i++) printf("="); + printf("\n"); return; } @@ -404,7 +411,8 @@ void DynMat::EnforceASR() if (i == 99){ printf("...... (%d more skiped)", fftdim-100); break;} } printf("\n"); - for (int i = 0; i < 80; ++i) printf("="); printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); return; } @@ -456,7 +464,7 @@ return; * --------------------------------------------------------------------*/ void DynMat::GaussJordan(int n, double *Mat) { - int i,icol,irow,j,k,l,ll,idr,idc; + int i,icol=0,irow=0,j,k,l,ll,idr,idc; int *indxc,*indxr,*ipiv; double big, nmjk; double dum, pivinv; diff --git a/tools/phonon/dynmat.h b/tools/phonon/dynmat.h index 1d6e716584..f5bd4010b8 100644 --- a/tools/phonon/dynmat.h +++ b/tools/phonon/dynmat.h @@ -7,11 +7,6 @@ #include "memory.h" #include "interpolate.h" -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} - using namespace std; class DynMat { diff --git a/tools/phonon/green.cpp b/tools/phonon/green.cpp index 8f8946dc4f..35514c03fb 100644 --- a/tools/phonon/green.cpp +++ b/tools/phonon/green.cpp @@ -224,7 +224,6 @@ void Green::recursion() { // local variables std::complex Z, rec_x, rec_x_inv; - std::complex cunit = std::complex(0.,1.); double w = wmin; diff --git a/tools/phonon/interpolate.cpp b/tools/phonon/interpolate.cpp index 8c0cbde1ce..954062d415 100644 --- a/tools/phonon/interpolate.cpp +++ b/tools/phonon/interpolate.cpp @@ -1,7 +1,125 @@ #include "interpolate.h" -#include "math.h" +#include #include "global.h" +/////////////////////// +// tricubic library code +static int A[64][64] = { +{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9,-9, 9, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4,-4, 4, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, +{-3, 0, 0, 0, 3, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9, 0, 0,-9, 9, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9, 0, 0,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0}, +{ 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, +{-27,27,27,-27,27,-27,-27,27,-18,-9,18, 9,18, 9,-18,-9,-18,18,-9, 
9,18,-18, 9,-9,-18,18,18,-18,-9, 9, 9,-9,-12,-6,-6,-3,12, 6, 6, 3,-12,-6,12, 6,-6,-3, 6, 3,-12,12,-6, 6,-6, 6,-3, 3,-8,-4,-4,-2,-4,-2,-2,-1}, +{18,-18,-18,18,-18,18,18,-18, 9, 9,-9,-9,-9,-9, 9, 9,12,-12, 6,-6,-12,12,-6, 6,12,-12,-12,12, 6,-6,-6, 6, 6, 6, 3, 3,-6,-6,-3,-3, 6, 6,-6,-6, 3, 3,-3,-3, 8,-8, 4,-4, 4,-4, 2,-2, 4, 4, 2, 2, 2, 2, 1, 1}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6, 9,-9, 9,-9,-9, 9,-9, 9,12,-12,-12,12, 6,-6,-6, 6, 6, 3, 6, 3,-6,-3,-6,-3, 8, 4,-8,-4, 4, 2,-4,-2, 6,-6, 6,-6, 3,-3, 3,-3, 4, 2, 4, 2, 2, 1, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-6, 6,-6, 6, 6,-6, 6,-6,-8, 8, 8,-8,-4, 4, 4,-4,-3,-3,-3,-3, 3, 3, 3, 3,-4,-4, 4, 4,-2,-2, 2, 2,-4, 4,-4, 4,-2, 2,-2, 2,-2,-2,-2,-2,-1,-1,-1,-1}, +{ 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4, 0, 0,-4, 4, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4, 0, 0,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6,12,-12, 6,-6,-12,12,-6, 6, 9,-9,-9, 9, 9,-9,-9, 9, 8, 4, 4, 2,-8,-4,-4,-2, 6, 3,-6,-3, 6, 3,-6,-3, 6,-6, 3,-3, 6,-6, 3,-3, 4, 2, 2, 1, 4, 2, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-8, 8,-4, 4, 8,-8, 4,-4,-6, 6, 6,-6,-6, 6, 6,-6,-4,-4,-2,-2, 4, 4, 2, 2,-3,-3, 3, 3,-3,-3, 3, 3,-4, 4,-2, 2,-4, 4,-2, 2,-2,-2,-1,-1,-2,-2,-1,-1}, +{ 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, +{-12,12,12,-12,12,-12,-12,12,-8,-4, 8, 4, 8, 4,-8,-4,-6, 6,-6, 6, 6,-6, 6,-6,-6, 6, 6,-6,-6, 6, 
6,-6,-4,-2,-4,-2, 4, 2, 4, 2,-4,-2, 4, 2,-4,-2, 4, 2,-3, 3,-3, 3,-3, 3,-3, 3,-2,-1,-2,-1,-2,-1,-2,-1}, +{ 8,-8,-8, 8,-8, 8, 8,-8, 4, 4,-4,-4,-4,-4, 4, 4, 4,-4, 4,-4,-4, 4,-4, 4, 4,-4,-4, 4, 4,-4,-4, 4, 2, 2, 2, 2,-2,-2,-2,-2, 2, 2,-2,-2, 2, 2,-2,-2, 2,-2, 2,-2, 2,-2, 2,-2, 1, 1, 1, 1, 1, 1, 1, 1}}; + +static int ijk2n(int i, int j, int k) { + return(i+4*j+16*k); +} + +/* ---------------------------------------------------------------------------- */ + +static void tricubic_get_coeff_stacked(double a[64], double x[64]) { + int i,j; + for (i=0;i<64;i++) { + a[i]=(double)(0.0); + for (j=0;j<64;j++) { + a[i]+=A[i][j]*x[j]; + } + } +} + +static void tricubic_get_coeff(double a[64], double f[8], double dfdx[8], double dfdy[8], double dfdz[8], double d2fdxdy[8], double d2fdxdz[8], double d2fdydz[8], double d3fdxdydz[8]) { + int i; + double x[64]; + for (i=0;i<8;i++) { + x[0+i]=f[i]; + x[8+i]=dfdx[i]; + x[16+i]=dfdy[i]; + x[24+i]=dfdz[i]; + x[32+i]=d2fdxdy[i]; + x[40+i]=d2fdxdz[i]; + x[48+i]=d2fdydz[i]; + x[56+i]=d3fdxdydz[i]; + } + tricubic_get_coeff_stacked(a,x); +} + +static double tricubic_eval(double a[64], double x, double y, double z) { + int i,j,k; + double ret=(double)(0.0); + /* TRICUBIC EVAL + This is the short version of tricubic_eval. It is used to compute + the value of the function at a given point (x,y,z). To compute + partial derivatives of f, use the full version with the extra args. 
+ */ + for (i=0;i<4;i++) { + for (j=0;j<4;j++) { + for (k=0;k<4;k++) { + ret+=a[ijk2n(i,j,k)]*pow(x,i)*pow(y,j)*pow(z,k); + } + } + } + return(ret); +} + /* ---------------------------------------------------------------------------- * Constructor used to get info from caller, and prepare other necessary data * ---------------------------------------------------------------------------- */ @@ -274,7 +392,8 @@ void Interpolate::set_method() which =2-im%2; printf("Your selection: %d\n", which); - for(int i=0; i<80; i++) printf("="); printf("\n\n"); + for(int i=0; i<80; i++) printf("="); + printf("\n\n"); if (which == 1) tricubic_init(); @@ -306,4 +425,3 @@ void Interpolate::reset_gamma() return; } -/* ---------------------------------------------------------------------------- */ diff --git a/tools/phonon/interpolate.h b/tools/phonon/interpolate.h index e192fcac87..04a358ae71 100644 --- a/tools/phonon/interpolate.h +++ b/tools/phonon/interpolate.h @@ -5,11 +5,8 @@ #include "stdlib.h" #include "string.h" #include "memory.h" -#include -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} + +extern "C" typedef struct { double r, i; } doublecomplex; using namespace std; diff --git a/tools/phonon/phonon.cpp b/tools/phonon/phonon.cpp index 43bea111b4..065885cf3f 100644 --- a/tools/phonon/phonon.cpp +++ b/tools/phonon/phonon.cpp @@ -42,7 +42,8 @@ Phonon::Phonon(DynMat *dm) printf("\n"); for (int i = 0; i < 37; ++i) printf("="); printf(" Menu "); - for (int i = 0; i < 37; ++i) printf("="); printf("\n"); + for (int i = 0; i < 37; ++i) printf("="); + printf("\n"); printf(" 1. Phonon DOS evaluation;\n"); printf(" 2. Phonon dispersion curves;\n"); printf(" 3. 
Dynamical matrix at arbitrary q;\n"); @@ -60,7 +61,8 @@ Phonon::Phonon(DynMat *dm) printf("Your choice [0]: "); if (count_words(fgets(str,MAXLINE,stdin)) > 0) job = atoi(strtok(str," \t\n\r\f")); printf("\nYour selection: %d\n", job); - for (int i = 0; i < 80; ++i) printf("=");printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); // now to do the job according to user's choice if (job == 1) pdos(); @@ -414,7 +416,8 @@ void Phonon::vfanyq() dynmat->geteigen(egvs, 0); printf("q-point: [%lg %lg %lg], ", q[0], q[1], q[2]); printf("vibrational frequencies at this q-point:\n"); - for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); printf("\n\n"); + for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); + printf("\n\n"); } return; @@ -1001,7 +1004,8 @@ void Phonon::ShowCell() printf("\n"); for (int i = 0; i < 30; ++i) printf("="); printf(" Unit Cell Info "); - for (int i = 0; i < 30; ++i) printf("="); printf("\n"); + for (int i = 0; i < 30; ++i) printf("="); + printf("\n"); printf("Number of atoms in the unit cell: %d\n", dynmat->nucell); printf("Basis vectors of the unit cell:\n"); printf(" %15.8f %15.8f %15.8f\n", dynmat->basevec[0], dynmat->basevec[1], dynmat->basevec[2]); @@ -1091,7 +1095,7 @@ int Phonon::count_words(const char *line) strcpy(copy,line); char *ptr; - if (ptr = strchr(copy,'#')) *ptr = '\0'; + if ((ptr = strchr(copy,'#'))) *ptr = '\0'; if (strtok(copy," \t\n\r\f") == NULL) { memory->destroy(copy); diff --git a/tools/phonon/version.h b/tools/phonon/version.h index 8ed0e80aa7..decab631b0 100644 --- a/tools/phonon/version.h +++ b/tools/phonon/version.h @@ -1 +1 @@ -#define VERSION 7 +#define VERSION 8