diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc33da60de..2cc11d4ecb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -37,6 +37,10 @@ enable_language(CXX) ##################################################################### include(CheckCCompilerFlag) +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") +endif() + ######################################################################## # User input options # ######################################################################## @@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN}) option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF) if(LAMMPS_EXCEPTIONS) add_definitions(-DLAMMPS_EXCEPTIONS) - set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS") + set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS") endif() set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically") @@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR}) ############################################ add_library(lammps ${LIB_SOURCES}) target_link_libraries(lammps ${LAMMPS_LINK_LIBS}) -add_dependencies(lammps ${LAMMPS_DEPS}) +if(LAMMPS_DEPS) + add_dependencies(lammps ${LAMMPS_DEPS}) +endif() set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE}) if(BUILD_SHARED_LIBS) set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION}) diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 7ec83b3207..5061f1af2e 100755 Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index 6451581d3a..fa7db1d266 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -706,7 +706,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding quantum forces calculated by LATTE. More information on LATTE can be found at this web site: -"https://github.com/lanl/LATTE"_#latte_home. A brief technical +"https://github.com/lanl/LATTE"_latte_home. A brief technical description is given with the "fix latte"_fix_latte.html command. :link(latte_home,https://github.com/lanl/LATTE) @@ -729,6 +729,7 @@ make lib-latte args="-b" # download and build in lib/latte/LATTE- make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte make lib-latte args="-b -m gfortran" # download and build in lib/latte and # copy Makefile.lammps.gfortran to Makefile.lammps +:pre Note that 3 symbolic (soft) links, "includelink" and "liblink" and "filelink", are created in lib/latte to point into the LATTE home dir. diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index 83e17b4f27..aaa38d7de2 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously. [Currently Available USER-INTEL Styles:] Angle Styles: charmm, harmonic :ulb,l -Bond Styles: fene, harmonic :l +Bond Styles: fene, fourier, harmonic :l Dihedral Styles: charmm, harmonic, opls :l -Fixes: nve, npt, nvt, nvt/sllod :l +Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l Improper Styles: cvff, harmonic :l Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, -buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, -lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, -sw, tersoff :l +buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, +rebo, sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule @@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors -(code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "June 2017" LAMMPS built with -Intel Parallel Studio 2017 update 2. Results are with 1 MPI task -per physical core. See {src/USER-INTEL/TEST/README} for the raw -simulation rates and instructions to reproduce. +(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named +Knights Landing), and Intel Xeon Gold 6148 processors (code-named +Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio +2017 update 2. Results are with 1 MPI task per physical core. See +{src/USER-INTEL/TEST/README} for the raw simulation rates and +instructions to reproduce. :line @@ -82,6 +83,11 @@ this order :l The {newton} setting applies to all atoms, not just atoms shared between MPI tasks :l Vectorization can change the order for adding pairwise forces :l +When using the -DLMP_USE_MKL_RNG define (all included intel optimized +makefiles do) at build time, the random number generator for +dissipative particle dynamics (pair style dpd/intel) uses the Mersenne +Twister generator included in the Intel MKL library (that should be +more robust than the default Masaglia random number generator) :l :ule The precision mode (described below) used with the USER-INTEL @@ -108,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l For some of the simple 2-body potentials without long-range electrostatics, performance and scalability can be better with the "newton off" setting added to the input script :l -For simulations on higher node counts, add "processors * * * grid +For simulations on higher node counts, add "processors * * * grid numa" to the beginning of the input script for better scalability :l If using {kspace_style pppm} in the input script, add "kspace_modify diff ad" for better performance :l @@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs: Runs should be performed using MCDRAM. :ulb,l :ule -For simulations using {kspace_style pppm} on Intel CPUs -supporting AVX-512: +For simulations using {kspace_style pppm} on Intel CPUs supporting +AVX-512: Add "kspace_modify diff ad" to the input script :ulb,l The command-line option should be changed to @@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building without offload support will produce a smaller binary. The general requirements for Makefiles with the USER-INTEL package -are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When -using Intel compilers, "-restrict" is required and "-qopenmp" is -highly recommended for CCFLAGS and LINKFLAGS. LIB should include -"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD" -is required for CCFLAGS and "-qoffload" is required for LINKFLAGS. -Other recommended CCFLAG options for best performance are -"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2 --no-prec-div". +are as follows. When using Intel compilers, "-restrict" is required +and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. +CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads +are not supported in the build environment) and "-DLMP_USE_MKL_RNG" +(unless Intel Math Kernel Library (MKL) is not available in the build +environment). For Intel compilers, LIB should include "-ltbbmalloc" +or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added +to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is +required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other +recommended CCFLAG options for best performance are "-O2 -fno-alias +-ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div". NOTE: The vectorization and math capabilities can differ depending on the CPU. For Intel compilers, the "-x" flag specifies the type of diff --git a/doc/src/atom_modify.txt b/doc/src/atom_modify.txt index d5c82f16ac..1dc0fa6bfb 100644 --- a/doc/src/atom_modify.txt +++ b/doc/src/atom_modify.txt @@ -16,7 +16,7 @@ atom_modify keyword values ... :pre one or more keyword/value pairs may be appended :ulb,l keyword = {id} or {map} or {first} or {sort} :l {id} value = {yes} or {no} - {map} value = {array} or {hash} + {map} value = {yes} or {array} or {hash} {first} value = group-ID = group whose atoms will appear first in internal atom lists {sort} values = Nfreq binsize Nfreq = sort atoms spatially every this many time steps @@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l [Examples:] -atom_modify map hash -atom_modify map array sort 10000 2.0 +atom_modify map yes +atom_modify map hash sort 10000 2.0 atom_modify first colloid :pre [Description:] @@ -62,29 +62,33 @@ switch. This is described in "Section 2.2"_Section_start.html#start_2 of the manual. If atom IDs are not used, they must be specified as 0 for all atoms, e.g. in a data or restart file. -The {map} keyword determines how atom ID lookup is done for molecular -atom styles. Lookups are performed by bond (angle, etc) routines in -LAMMPS to find the local atom index associated with a global atom ID. +The {map} keyword determines how atoms with specific IDs are found +when required. An example are the bond (angle, etc) methods which +need to find the local index of an atom with a specific global ID +which is a bond (angle, etc) partner. LAMMPS performs this operation +efficiently by creating a "map", which is either an {array} or {hash} +table, as descibed below. -When the {array} value is used, each processor stores a lookup table -of length N, where N is the largest atom ID in the system. This is a +When the {map} keyword is not specified in your input script, LAMMPS +only creates a map for "atom_styles"_atom_style.html for molecular +systems which have permanent bonds (angles, etc). No map is created +for atomic systems, since it is normally not needed. However some +LAMMPS commands require a map, even for atomic systems, and will +generate an error if one does not exist. The {map} keyword thus +allows you to force the creation of a map. The {yes} value will +create either an {array} or {hash} style map, as explained in the next +paragraph. The {array} and {hash} values create an atom-style or +hash-style map respectively. + +For an {array}-style map, each processor stores a lookup table of +length N, where N is the largest atom ID in the system. This is a fast, simple method for many simulations, but requires too much memory -for large simulations. The {hash} value uses a hash table to perform -the lookups. This can be slightly slower than the {array} method, but -its memory cost is proportional to the number of atoms owned by a -processor, i.e. N/P when N is the total number of atoms in the system -and P is the number of processors. - -When this setting is not specified in your input script, LAMMPS -creates a map, if one is needed, as an array or hash. See the -discussion of default values below for how LAMMPS chooses which kind -of map to build. Note that atomic systems do not normally need to -create a map. However, even in this case some LAMMPS commands will -create a map to find atoms (and then destroy it), or require a -permanent map. An example of the former is the "velocity loop -all"_velocity.html command, which uses a map when looping over all -atoms and insuring the same velocity values are assigned to an atom -ID, no matter which processor owns it. +for large simulations. For a {hash}-style map, a hash table is +created on each processor, which finds an atom ID in constant time +(independent of the global number of atom IDs). It can be slightly +slower than the {array} map, but its memory cost is proportional to +the number of atoms owned by a processor, i.e. N/P when N is the total +number of atoms in the system and P is the number of processors. The {first} keyword allows a "group"_group.html to be specified whose atoms will be maintained as the first atoms in each processor's list diff --git a/doc/src/dihedral_fourier.txt b/doc/src/dihedral_fourier.txt index da892b59da..0accbb22bf 100644 --- a/doc/src/dihedral_fourier.txt +++ b/doc/src/dihedral_fourier.txt @@ -7,6 +7,7 @@ :line dihedral_style fourier command :h3 +dihedral_style fourier/intel command :h3 dihedral_style fourier/omp command :h3 [Syntax:] diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt index 2ea1da3db3..38d9aad4d9 100644 --- a/doc/src/dump_modify.txt +++ b/doc/src/dump_modify.txt @@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre dump-ID = ID of dump to modify :ulb,l one or more keyword/value pairs may be appended :l these keywords apply to various dump styles :l -keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l - {append} arg = {yes} or {no} or {at} N +keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l + {append} arg = {yes} or {no} + {at} arg = N N = index of frame written upon first dump + only available after "append yes" {buffer} arg = {yes} or {no} {element} args = E1 E2 ... EN, where N = # of atom types E1,...,EN = element name, e.g. C or Fe or Ga diff --git a/doc/src/dump_netcdf.txt b/doc/src/dump_netcdf.txt index 63568137a6..70111a36a8 100644 --- a/doc/src/dump_netcdf.txt +++ b/doc/src/dump_netcdf.txt @@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule dump 1 all netcdf 100 traj.nc type x y z vx vy vz dump_modify 1 append yes at -1 thermo yes -dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre +dump 1 all netcdf/mpiio 1000 traj.nc id type x y z +dump 1 all netcdf 1000 traj.*.nc id type x y z :pre [Description:] @@ -73,4 +74,3 @@ section for more info. [Related commands:] "dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html - diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt index f78e13b866..4edd610546 100644 --- a/doc/src/fix_latte.txt +++ b/doc/src/fix_latte.txt @@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond integrals are parameterized using a Slater-Koster tight-binding approach. This procedure, which usually is referred to as the DFTB method has been described in detail by ("Elstner"_#Elstner) and -("Finnis"_#Finnis) and coworkers. +("Finnis"_#Finnis2) and coworkers. The work of the LATTE developers follows that of Elstner closely with respect to the physical model. However, the development of LATTE is @@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, 7260 (1998). -:link(Finnis) +:link(Finnis2) [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998). @@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014). [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys., 141, 164123, (2014). -:link(Niklasson2014) +:link(Niklasson2017) [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017). -:link(Niklasson2012) -[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 +:link(Cawkwell2012) +[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 (17), 174308 (2012). :link(Negre2016) diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt index 52d8a7df84..73b3e31266 100644 --- a/doc/src/fix_neb.txt +++ b/doc/src/fix_neb.txt @@ -93,7 +93,7 @@ intermediate replica with the previous and the next image: Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre -Note that in this case the specified {Kspring) is in force/distance +Note that in this case the specified {Kspring} is in force/distance units. With a value of {ideal}, the spring force is computed as suggested in @@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and RDideal is the ideal RD for which all the images are equally spaced. I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where I is the replica number). The meanDist is the average distance -between replicas. Note that in this case the specified {Kspring) is +between replicas. Note that in this case the specified {Kspring} is in force units. Note that the {ideal} form of nudging can often be more effective at diff --git a/doc/src/fix_nh.txt b/doc/src/fix_nh.txt index 8fa30ac222..41d0e6438f 100644 --- a/doc/src/fix_nh.txt +++ b/doc/src/fix_nh.txt @@ -393,32 +393,36 @@ thermostatting and barostatting. :line These fixes compute a temperature and pressure each timestep. To do -this, the fix creates its own computes of style "temp" and "pressure", -as if one of these two sets of commands had been issued: +this, the thermostat and barostat fixes create their own computes of +style "temp" and "pressure", as if one of these sets of commands had +been issued: +For fix nvt: compute fix-ID_temp group-ID temp -compute fix-ID_press group-ID pressure fix-ID_temp :pre +For fix npt and fix nph: compute fix-ID_temp all temp compute fix-ID_press all pressure fix-ID_temp :pre -See the "compute temp"_compute_temp.html and "compute -pressure"_compute_pressure.html commands for details. Note that the -IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID -+ underscore + "press". For fix nvt, the group for the new computes -is the same as the fix group. For fix nph and fix npt, the group for -the new computes is "all" since pressure is computed for the entire -system. +For fix nvt, the group for the new temperature compute is the same as +the fix group. For fix npt and fix nph, the group for both the new +temperature and pressure compute is "all" since pressure is computed +for the entire system. In the case of fix nph, the temperature +compute is not used for thermostatting, but just for a kinetic-energy +contribution to the pressure. See the "compute +temp"_compute_temp.html and "compute pressure"_compute_pressure.html +commands for details. Note that the IDs of the new computes are the +fix-ID + underscore + "temp" or fix_ID + underscore + "press". Note that these are NOT the computes used by thermodynamic output (see the "thermo_style"_thermo_style.html command) with ID = {thermo_temp} -and {thermo_press}. This means you can change the attributes of this +and {thermo_press}. This means you can change the attributes of these fix's temperature or pressure via the -"compute_modify"_compute_modify.html command or print this temperature -or pressure during thermodynamic output via the "thermo_style -custom"_thermo_style.html command using the appropriate compute-ID. -It also means that changing attributes of {thermo_temp} or -{thermo_press} will have no effect on this fix. +"compute_modify"_compute_modify.html command. Or you can print this +temperature or pressure during thermodynamic output via the +"thermo_style custom"_thermo_style.html command using the appropriate +compute-ID. It also means that changing attributes of {thermo_temp} +or {thermo_press} will have no effect on this fix. Like other fixes that perform thermostatting, fix nvt and fix npt can be used with "compute commands"_compute.html that calculate a diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt index b93bb9d0a2..97a7b58050 100644 --- a/doc/src/fixes.txt +++ b/doc/src/fixes.txt @@ -59,6 +59,7 @@ Fixes :h1 fix_langevin fix_langevin_drude fix_langevin_eff + fix_latte fix_lb_fluid fix_lb_momentum fix_lb_pc diff --git a/doc/src/lammps.book b/doc/src/lammps.book index 77c70775d7..6a2c422a83 100644 --- a/doc/src/lammps.book +++ b/doc/src/lammps.book @@ -188,6 +188,7 @@ fix_ipi.html fix_langevin.html fix_langevin_drude.html fix_langevin_eff.html +fix_latte.html fix_lb_fluid.html fix_lb_momentum.html fix_lb_pc.html diff --git a/doc/src/package.txt b/doc/src/package.txt index 58f6a5e34d..5c698934e8 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -62,7 +62,7 @@ args = arguments specific to the style :l {no_affinity} values = none {kokkos} args = keyword value ... zero or more keyword/value pairs may be appended - keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} + keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} {neigh} value = {full} or {half} full = full neighbor list half = half neighbor list built in thread-safe manner @@ -75,9 +75,10 @@ args = arguments specific to the style :l {binsize} value = size size = bin size for neighbor list construction (distance units) {comm} value = {no} or {host} or {device} - use value for both comm/exchange and comm/forward + use value for comm/exchange and comm/forward and comm/reverse {comm/exchange} value = {no} or {host} or {device} {comm/forward} value = {no} or {host} or {device} + {comm/reverse} value = {no} or {host} or {device} no = perform communication pack/unpack in non-KOKKOS mode host = perform pack/unpack on host (e.g. with OpenMP threading) device = perform pack/unpack on device (e.g. on GPU) @@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at performing pairwise interactions, then this rule of thumb may give too large a binsize. -The {comm} and {comm/exchange} and {comm/forward} keywords determine +The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists are rebuilt. The data is only for atoms that migrate to new processors. -"Forward" communication happens every timestep. The data is for atom +"Forward" communication happens every timestep. "Reverse" communication +happens every timestep if the {newton} option is on. The data is for atom coordinates and any other atom properties that needs to be updated for ghost atoms owned by each processor. The {comm} keyword is simply a short-cut to set the same value -for both the {comm/exchange} and {comm/forward} keywords. +for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords. The value options for all 3 keywords are {no} or {host} or {device}. A value of {no} means to use the standard non-KOKKOS method of diff --git a/doc/src/pair_dpd.txt b/doc/src/pair_dpd.txt index 8d194bb092..9e29e93430 100644 --- a/doc/src/pair_dpd.txt +++ b/doc/src/pair_dpd.txt @@ -8,6 +8,7 @@ pair_style dpd command :h3 pair_style dpd/gpu command :h3 +pair_style dpd/intel command :h3 pair_style dpd/omp command :h3 pair_style dpd/tstat command :h3 pair_style dpd/tstat/gpu command :h3 diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt index a0026432ec..03e77f53ab 100644 --- a/doc/src/pair_eam.txt +++ b/doc/src/pair_eam.txt @@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix. Style {eam/fs} computes pairwise interactions for metals and metal alloys using a generalized form of EAM potentials due to Finnis and -Sinclair "(Finnis)"_#Finnis. The total energy Ei of an atom I is +Sinclair "(Finnis)"_#Finnis1. The total energy Ei of an atom I is given by :c,image(Eqs/pair_eam_fs.jpg) @@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004). [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983). Daw, Baskes, Phys Rev B, 29, 6443 (1984). -:link(Finnis) +:link(Finnis1) [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984). :link(Stukowski) diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 43d3f17d63..d414056187 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b8236e8fd1..4641232a1f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -443,7 +443,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_LIBS += -lmemkind + KOKKOS_LIBS += -lmemkind -lnuma tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) endif @@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P8. - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif endif endif @@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P9. - KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 - KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif endif endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 9082e47052..3db9a145d7 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool::free_state(const Random_XorShift102 } +#endif + +#if defined(KOKKOS_ENABLE_ROCM) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + const int stride_; + friend class Random_XorShift1024_Pool; + public: + + typedef Kokkos::Experimental::ROCm device_type; + typedef Random_XorShift1024_Pool pool_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; + S = U*U+V*V; + } + return U*std::sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __HCC_ACCELERATOR__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(state_, p_(i), i); +#else + return Random_XorShift1024(state_, p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __HCC_ACCELERATOR__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + #endif diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index b74192ef18..a5a10c82ee 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Threads @@ -51,6 +57,9 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda +KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm + KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda +test-rocm: KokkosAlgorithms_UnitTest_ROCm + ./KokkosAlgorithms_UnitTest_ROCm + test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..720b377ed2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void rocm_test_random_xorshift64( int num_draws ) +{ + Impl::test_random >(num_draws); +} + +void rocm_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random >(num_draws); +} + + +#define ROCM_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( rocm, Random_XorShift64 ) { \ + rocm_test_random_xorshift64(num_draws); \ + } + +#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( rocm, Random_XorShift1024 ) { \ + rocm_test_random_xorshift1024(num_draws); \ + } + +#define ROCM_SORT_UNSIGNED( size ) \ + TEST_F( rocm, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \ + } + +ROCM_RANDOM_XORSHIFT64( 132141141 ) +ROCM_RANDOM_XORSHIFT1024( 52428813 ) +ROCM_SORT_UNSIGNED(171) + +#undef ROCM_RANDOM_XORSHIFT64 +#undef ROCM_RANDOM_XORSHIFT1024 +#undef ROCM_SORT_UNSIGNED +} +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index ca34648780..b88b334f8b 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -27,7 +27,7 @@ fi HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then MY_PID="$BASHPID" - HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) + HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi ################################################################################ @@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) ################################################################################ HPCBIND_QUEUE_NAME="" declare -i HPCBIND_QUEUE_INDEX=0 -declare -i HPCBIND_QUEUE_GPU_MAPPING=0 +declare -i HPCBIND_QUEUE_MAPPING=0 -if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 - HPCBIND_QUEUE_NAME="sbatch" +if [[ ! -z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_INDEX=${PMI_RANK} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} +elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} elif [[ ! -z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="bsub" HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} elif [[ ! -z "${ALPS_APP_PE}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} fi - ################################################################################ # Show help ################################################################################ @@ -91,13 +102,14 @@ function show_help { echo " --proc-bind= Set the initial process mask for the script" echo " LOC can be any valid location argument for" echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore the its parent process binding" echo " --distribute=N Distribute the current cpuset into N partitions" echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" echo " sequential order" - echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" @@ -110,22 +122,30 @@ function show_help { echo " --force-openmp-proc-bind=" echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" - echo " --show-bindings Show the bindings" - echo " --lstopo Show bindings in lstopo without executing a command" - echo " -v|--verbose Show options and relevant environment variables" + echo " --output-prefix=

Save the output to files of the form" + echo " P-N.log, P-N.out and P-N.err where P is the prefix" + echo " and N is the queue index or mpi rank (no spaces)" + echo " --output-mode= How console output should be handled." + echo " Options are all, rank0, and none. Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " -v|--verbose Print bindings and relevant environment variables" echo " -h|--help Show this message" echo "" echo "Sample Usage:" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." - echo " Bing the process to all even cores" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." - echo " Bind to the first 64 cores and split the current process cpuset into 4" - echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." - echo " skip GPU 0 when mapping visible devices" + echo " Bind the the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." echo " Display the current bindings" - echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " ${cmd} --proc-bind=numa:0 -- command" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -144,7 +164,7 @@ fi declare -a UNKNOWN_ARGS=() declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} declare -i HPCBIND_DISTRIBUTE=1 -declare -i HPCBIND_PARTITION=0 +declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 declare -i HPCBIND_OPENMP_PERCENT=100 @@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND="" HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} declare -i HPCBIND_VERBOSE=0 -declare -i HPCBIND_SHOW_BINDINGS=0 declare -i HPCBIND_LSTOPO=0 -for i in $@; do - case $i in +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in # number of partitions to create --no-hwloc-bind) HPCBIND_ENABLE_HWLOC_BIND=0 @@ -169,6 +193,10 @@ for i in $@; do HPCBIND_PROC_BIND="${i#*=}" shift ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" shift @@ -182,8 +210,8 @@ for i in $@; do HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') shift ;; - --gpu-ignore-queue) - HPCBIND_QUEUE_GPU_MAPPING=0 + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 shift ;; --no-gpu-mapping) @@ -218,14 +246,18 @@ for i in $@; do HPCBIND_OPENMP_NESTED="false" shift ;; - --show-bindings) - HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=1 + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" shift ;; --lstopo) HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=0 HPCBIND_LSTOPO=1 shift ;; @@ -239,6 +271,7 @@ for i in $@; do ;; # ignore remaining arguments --) + HPCBIND_HAS_COMMAND=1 shift break ;; @@ -250,16 +283,41 @@ for i in $@; do esac done +################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then + #default to rank0 printing to screen + HPCBIND_TEE=1 +fi + + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + > ${HPCBIND_LOG} +fi + ################################################################################ # Check unknown arguments ################################################################################ if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" + echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG}) exit 1 fi - ################################################################################ # Check that visible gpus are valid ################################################################################ @@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then - echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG}) HPCBIND_VISIBLE_GPUS[$i]=0; fi done NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi - ################################################################################ # Check OpenMP percent ################################################################################ if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - echo "OpenMP percent < 1, setting to 1" HPCBIND_OPENMP_PERCENT=1 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - echo "OpenMP percent > 100, setting to 100" HPCBIND_OPENMP_PERCENT=100 fi @@ -291,15 +346,21 @@ fi # Check distribute ################################################################################ if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" HPCBIND_DISTRIBUTE=1 fi -if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then - echo "Invalid input for distribute-partition, changing to 0" +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi ################################################################################ # Find cpuset and num threads @@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) else - BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) fi - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) - HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) @@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED} ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi fi @@ -389,22 +454,22 @@ fi export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} -export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else - export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi -export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') -export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} - export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} - export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -412,43 +477,63 @@ fi # Print verbose ################################################################################ -if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then - MY_ENV=$(env | sort) - echo "[HPCBIND]" - echo "${MY_ENV}" | grep -E "^HPCBIND_" - echo "[CUDA]" - echo "${MY_ENV}" | grep -E "^CUDA_" - echo "[OPENMP]" - echo "${MY_ENV}" | grep -E "^OMP_" -fi +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} -if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu -elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi fi ################################################################################ # Run command ################################################################################ -if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ - else - eval $@ - fi -else - if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi fi fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind deleted file mode 100755 index b6fe07a1bd..0000000000 --- a/lib/kokkos/bin/kokkos-bind +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env bash - -# check if hwloc commands exist -declare -i HAS_HWLOC=0 -type hwloc-bind >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-distrib >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ls >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-calc >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ps >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - - -#parse args -declare -a UNKNOWN_ARGS=() -declare -i DISTRIBUTE=1 -declare -i INDEX=0 -PROC_BIND="all" -CURRENT_CPUSET="" -OPENMP_VERSION=4.0 -OPENMP_PROC_BIND=True -OPENMP_NESTED=True -VERBOSE=False - -#get the current process cpuset -if [[ ${HAS_HWLOC} -eq 0 ]]; then - MY_PID="$BASHPID" - CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) - echo "$CURRENT_CPUSET" -fi - -function show_help { - local cmd=$(basename "$0") - echo "Usage: ${cmd} -- command ..." - echo " Uses hwloc to divide the node into the given number of groups," - echo " set the appropriate OMP_NUM_THREADS and execute the command on the" - echo " selected group." - echo "" - echo " NOTE: This command assumes it has exclusive use of the node" - echo "" - echo "Options:" - echo " --proc-bind= Set the initial process mask for the script. " - echo " LOC can be any valid location argumnet for" - echo " hwloc-calc. Defaults to the entire machine" - echo " --distribute=N Distribute the current proc-bind into N groups" - echo " --index=I Use the i'th group (zero based)" - echo " --openmp=M.m Set env variables for the given OpenMP version" - echo " (default 4.0)" - echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" - echo " --no-openmp-nested Set OMP_NESTED to false" - echo " -v|--verbose" - echo " -h|--help" - echo "" - echo "Sample Usage:" - echo " ${cmd} --distribute=4 --index=2 -v -- command ..." - echo "" -} - -if [[ "$#" -eq 0 ]]; then - show_help - exit 0 -fi - - -for i in $@; do - case $i in - # number of partitions to create - --proc-bind=*) - PROC_BIND="${i#*=}" - shift - ;; - --distribute=*) - DISTRIBUTE="${i#*=}" - shift - ;; - # which group to use - --index=*) - INDEX="${i#*=}" - shift - ;; - --openmp=*) - OPENMP_VERSION="${i#*=}" - shift - ;; - --no-openmp-proc-bind) - OPENMP_PROC_BIND=False - shift - ;; - --no-openmp-nested) - OPENMP_NESTED=False - shift - ;; - -v|--verbose) - VERBOSE=True - shift - ;; - -h|--help) - show_help - exit 0 - ;; - # ignore remaining arguments - --) - shift - break - ;; - # unknown option - *) - UNKNOWN_ARGS+=("$i") - shift - ;; - esac -done - -if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" - exit 1 -fi - -if [[ ${DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" - DISTRIBUTE=1 -fi - -if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then - echo "Invalid input for index, changing index to 0" - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -ne 0 ]]; then - echo "hwloc not found, no process binding will occur" - DISTRIBUTE=1 - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -eq 0 ]]; then - - if [[ "${CURRENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${PROC_BIND}) - else - BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND}) - fi - - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE})) - CPUSET=${CPUSETS[${INDEX}]} - NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: true" - echo " proc_bind: ${PROC_BIND}" - echo " distribute: ${DISTRIBUTE}" - echo " index: ${INDEX}" - echo " parent_cpuset: ${CURRENT_CPUSET}" - echo " cpuset: ${CPUSET}" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - hwloc-bind ${CPUSET} -- $@ -else - NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: false" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - eval $@ -fi - diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 09fa5d500a..76e33f3c66 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -130,10 +133,16 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 96b05c02e1..6f9ca897d9 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test index 92a1b1c068..ffb565fcbb 100644 --- a/lib/kokkos/config/trilinos-integration/checkin-test +++ b/lib/kokkos/config/trilinos-integration/checkin-test @@ -1,4 +1,4 @@ module purge -module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu #Run Trilinos CheckinTest diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 0408472c68..996b6b5610 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -125,6 +125,123 @@ namespace Impl { }; } +/// \class GraphRowViewConst +/// \brief View of a row of a sparse graph. +/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph. +/// +/// This class provides a generic view of a row of a sparse graph. +/// We intended this class to view a row of a StaticCrsGraph, but +/// GraphType need not necessarily be CrsMatrix. +/// +/// The row view is suited for computational kernels like sparse +/// matrix-vector multiply, as well as for modifying entries in the +/// sparse matrix. The view is always const as it does not allow graph modification. +/// +/// Here is an example loop over the entries in the row: +/// \code +/// typedef typename GraphRowViewConst::ordinal_type ordinal_type; +/// +/// GraphRowView G_i = ...; +/// const ordinal_type numEntries = G_i.length; +/// for (ordinal_type k = 0; k < numEntries; ++k) { +/// ordinal_type j = G_i.colidx (k); +/// // ... do something with A_ij and j ... +/// } +/// \endcode +/// +/// GraphType must provide the \c data_type +/// typedefs. In addition, it must make sense to use GraphRowViewConst to +/// view a row of GraphType. In particular, column +/// indices of a row must be accessible using the entries +/// resp. colidx arrays given to the constructor of this +/// class, with a constant stride between successive entries. +/// The stride is one for the compressed sparse row storage format (as +/// is used by CrsMatrix), but may be greater than one for other +/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal). +template +struct GraphRowViewConst { + //! The type of the column indices in the row. + typedef const typename GraphType::data_type ordinal_type; + +private: + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive entries in the row. + /// + /// For compressed sparse row (CSR) storage, this is always one. + /// This might be greater than one for storage formats like ELLPACK + /// or Jagged Diagonal. Nevertheless, the stride can never be + /// greater than the number of rows or columns in the matrix. Thus, + /// \c ordinal_type is the correct type. + const ordinal_type stride_; + +public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( ordinal_type* const colidx_in, + const ordinal_type& stride, + const ordinal_type& count) : + colidx_ (colidx_in), stride_ (stride), length (count) + {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array + /// + /// \tparam OffsetType The type of \c idx (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. + /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( const typename GraphType::entries_type& colidx_in, + const ordinal_type& stride, + const ordinal_type& count, + const OffsetType& idx, + const typename std::enable_if::value, int>::type& = 0) : + colidx_ (&colidx_in(idx)), stride_ (stride), length (count) + {} + + /// \brief Number of entries in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. + const ordinal_type length; + + /// \brief (Const) reference to the column index of entry i in this + /// row of the sparse matrix. + /// + /// "Entry i" is not necessarily the entry with column index i, nor + /// does i necessarily correspond to the (local) row index. + KOKKOS_INLINE_FUNCTION + ordinal_type& colidx (const ordinal_type& i) const { + return colidx_[i*stride_]; + } + + /// \brief An alias for colidx + KOKKOS_INLINE_FUNCTION + ordinal_type& operator()(const ordinal_type& i) const { + return colidx(i); + } +}; + + /// \class StaticCrsGraph /// \brief Compressed row storage array. /// @@ -218,6 +335,38 @@ public: static_cast (0); } + /// \brief Return a const view of row i of the graph. + /// + /// If row i does not belong to the graph, return an empty view. + /// + /// The returned object \c view implements the following interface: + ///

    + ///
  • \c view.length is the number of entries in the row
  • + ///
  • \c view.colidx(k) returns a const reference to the + /// column index of the k-th entry in the row
  • + ///
+ /// k is not a column index; it just counts from 0 to + /// view.length - 1. + /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. That allows compile-time + /// polymorphism for different kinds of sparse matrix formats (e.g., + /// ELLPACK or Jagged Diagonal) that we may wish to support in the + /// future. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst rowConst (const data_type i) const { + const size_type start = row_map(i); + // count is guaranteed to fit in ordinal_type, as long as no row + // has duplicate entries. + const data_type count = static_cast (row_map(i+1) - start); + + if (count == 0) { + return GraphRowViewConst (NULL, 1, 0); + } else { + return GraphRowViewConst (entries, 1, count, start); + } + } + /** \brief Create a row partitioning into a given number of blocks * balancing non-zeros + a fixed cost per row. */ diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index 46321378d9..c184c14d07 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> if (RP::inner_direction == RP::Left) { // Loop over size maxnumblocks until full range covered for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index cae8ecd489..079d9f0889 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 26b47a8b74..f8355f0d06 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -242,45 +242,89 @@ public: re_ = v; } + template KOKKOS_INLINE_FUNCTION - complex& operator += (const complex& src) { + complex& + operator += (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile complex& src) volatile { + void + operator += (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; } KOKKOS_INLINE_FUNCTION - complex& operator += (const RealType& src) { + complex& + operator += (const std::complex& src) { + re_ += src.real(); + im_ += src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator += (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile RealType& src) volatile { + void + operator += (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; } - + + template KOKKOS_INLINE_FUNCTION - complex& operator -= (const complex& src) { + complex& + operator -= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src.re_; im_ -= src.im_; return *this; } KOKKOS_INLINE_FUNCTION - complex& operator -= (const RealType& src) { + complex& + operator -= (const std::complex& src) { + re_ -= src.real(); + im_ -= src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator -= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src; return *this; } + template KOKKOS_INLINE_FUNCTION - complex& operator *= (const complex& src) { + complex& + operator *= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -288,8 +332,12 @@ public: return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile complex& src) volatile { + void + operator *= (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -297,20 +345,70 @@ public: } KOKKOS_INLINE_FUNCTION - complex& operator *= (const RealType& src) { + complex& + operator *= (const std::complex& src) { + const RealType realPart = re_ * src.real() - im_ * src.imag(); + const RealType imagPart = re_ * src.imag() + im_ * src.real(); + re_ = realPart; + im_ = imagPart; + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator *= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile RealType& src) volatile { + void + operator *= (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; } + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const complex& y) { + complex& + operator /= (const complex& y) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. + // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. + if (s == 0.0) { + this->re_ /= s; + this->im_ /= s; + } + else { + const complex x_scaled (this->re_ / s, this->im_ / s); + const complex y_conj_scaled (y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + complex& + operator /= (const std::complex& y) { + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. @@ -334,57 +432,95 @@ public: return *this; } + + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const RealType& src) { + complex& + operator /= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + re_ /= src; im_ /= src; return *this; } + template KOKKOS_INLINE_FUNCTION - bool operator == (const complex& src) { - return (re_ == src.re_) && (im_ == src.im_); + bool + operator == (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src.re_)) && (im_ == static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator == (const RealType src) { - return (re_ == src) && (im_ == RealType(0)); + bool + operator == (const std::complex& src) { + return (re_ == src.real()) && (im_ == src.imag()); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator == (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src)) && (im_ == RealType(0)); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator != (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ != static_cast(src.re_)) || (im_ != static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator != (const complex& src) { - return (re_ != src.re_) || (im_ != src.im_); + bool + operator != (const std::complex& src) { + return (re_ != src.real()) || (im_ != src.imag()); } + template KOKKOS_INLINE_FUNCTION - bool operator != (const RealType src) { - return (re_ != src) || (im_ != RealType(0)); - } + bool + operator != (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + return (re_ != static_cast(src)) || (im_ != RealType(0)); + } + }; //! Binary + operator for complex complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const complex& y) { - return complex (x.real () + y.real (), x.imag () + y.imag ()); +complex::type> +operator + (const complex& x, const complex& y) { + return complex::type > (x.real () + y.real (), x.imag () + y.imag ()); } //! Binary + operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const RealType& y) { - return complex (x.real () + y , x.imag ()); +complex::type> +operator + (const complex& x, const RealType2& y) { + return complex::type> (x.real () + y , x.imag ()); } //! Binary + operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const RealType& x, const complex& y) { - return complex (x + y.real (), y.imag ()); +complex::type> +operator + (const RealType1& x, const complex& y) { + return complex::type> (x + y.real (), y.imag ()); } //! Unary + operator for complex. @@ -396,27 +532,27 @@ operator + (const complex& x) { } //! Binary - operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const complex& y) { - return complex (x.real () - y.real (), x.imag () - y.imag ()); +complex::type> +operator - (const complex& x, const complex& y) { + return complex::type> (x.real () - y.real (), x.imag () - y.imag ()); } //! Binary - operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const RealType& y) { - return complex (x.real () - y , x.imag ()); +complex::type> +operator - (const complex& x, const RealType2& y) { + return complex::type> (x.real () - y , x.imag ()); } //! Binary - operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const RealType& x, const complex& y) { - return complex (x - y.real (), - y.imag ()); +complex::type> +operator - (const RealType1& x, const complex& y) { + return complex::type> (x - y.real (), - y.imag ()); } //! Unary - operator for complex. @@ -428,12 +564,12 @@ operator - (const complex& x) { } //! Binary * operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +complex::type> +operator * (const complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for std::complex and complex. @@ -446,33 +582,34 @@ operator * (const complex& x, const complex& y) { /// This function cannot be called in a CUDA device function, because /// std::complex's methods and nonmember functions are not marked as /// CUDA device functions. -template -complex -operator * (const std::complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +template +inline +complex::type> +operator * (const std::complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const RealType& x, const complex& y) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const RealType1& x, const complex& y) { + return complex::type> (x * y.real (), x * y.imag ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& y, const RealType& x) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const complex& y, const RealType2& x) { + return complex::type> (x * y.real (), x * y.imag ()); } //! Imaginary part of a complex number. @@ -539,33 +676,34 @@ complex pow (const complex& x) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const complex& x, const RealType2& y) { - return complex (real (x) / y, imag (x) / y); + return complex::type> (real (x) / y, imag (x) / y); } //! Binary operator / for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator / (const complex& x, const complex& y) { +complex::type> +operator / (const complex& x, const complex& y) { // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs (real (y)) + std::fabs (imag (y)); + typedef typename std::common_type::type common_real_type; + const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, // because y/s is NaN. if (s == 0.0) { - return complex (real (x) / s, imag (x) / s); + return complex (real (x) / s, imag (x) / s); } else { - const complex x_scaled (real (x) / s, imag (x) / s); - const complex y_conj_scaled (real (y) / s, -imag (y) / s); - const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + + const complex x_scaled (real (x) / s, imag (x) / s); + const complex y_conj_scaled (real (y) / s, -imag (y) / s); + const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y)) - complex result = x_scaled * y_conj_scaled; + complex result = x_scaled * y_conj_scaled; result /= y_scaled_abs; return result; } @@ -574,16 +712,19 @@ operator / (const complex& x, const complex& y) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const RealType1& x, const complex& y) { - return complex (x)/y; + return complex::type> (x)/y; } //! Equality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const complex& y) { - return real (x) == real (y) && imag (x) == imag (y); +bool +operator == (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(real (y)) && + static_cast(imag (x)) == static_cast(imag (y)) ); } /// \brief Equality operator for std::complex and Kokkos::complex. @@ -592,50 +733,68 @@ bool operator == (const complex& x, const complex& y) { /// Otherwise, CUDA builds will give compiler warnings ("warning: /// calling a constexpr __host__ function("real") from a __host__ /// __device__ function("operator==") is not allowed"). -template -bool operator == (const std::complex& x, const complex& y) { - return std::real (x) == real (y) && std::imag (x) == imag (y); +template +inline +bool +operator == (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) == static_cast(real (y)) && + static_cast(std::imag (x)) == static_cast(imag (y)) ); } - + //! Equality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const RealType2& y) { - return real (x) == y && imag (x) == static_cast (0.0); +bool +operator == (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(y) && + static_cast(imag (x)) == static_cast(0.0) ); } //! Equality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const RealType& x, const complex& y) { +bool +operator == (const RealType1& x, const complex& y) { return y == x; } //! Inequality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const complex& y) { - return real (x) != real (y) || imag (x) != imag (y); +bool +operator != (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(real (y)) || + static_cast(imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for std::complex and Kokkos::complex. -template -KOKKOS_INLINE_FUNCTION -bool operator != (const std::complex& x, const complex& y) { - return std::real (x) != real (y) || std::imag (x) != imag (y); +template +inline +bool +operator != (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) != static_cast(real (y)) || + static_cast(std::imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const RealType2& y) { - return real (x) != y || imag (x) != static_cast (0.0); +bool +operator != (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(y) || + static_cast(imag (x)) != static_cast(0.0) ); } //! Inequality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const RealType& x, const complex& y) { +bool +operator != (const RealType1& x, const complex& y) { return y != x; } diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index f089c16ad2..b9c131cd7a 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -353,7 +353,14 @@ struct CountAndFill { struct Fill {}; KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); - data_type* fill = &(m_crs.entries(j)); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? + nullptr : (&(m_crs.entries(j))); m_functor(i, fill); } using self_type = CountAndFill; diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index 9c9af0dd8b..b811751a2c 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -147,12 +147,11 @@ public: , const size_t arg_alloc_size ) const; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return "HBW"; } private: AllocationMechanism m_alloc_mech; - static constexpr const char* m_name = "HBW"; friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >; }; diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 339571941d..a825fd54d3 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -192,7 +192,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast(0.0f);} KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast(1.0f);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;} }; @@ -200,7 +200,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;} }; @@ -208,7 +208,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;} }; diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp index b13b0b01de..0118d4667e 100644 --- a/lib/kokkos/core/src/Kokkos_ROCm.hpp +++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp @@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace } // namespace Kokkos + +#define threadIdx_x (hc_get_workitem_id(0)) +#define threadIdx_y (hc_get_workitem_id(1)) +#define threadIdx_z (hc_get_workitem_id(2)) + +#define blockIdx_x (hc_get_group_id(0)) +#define blockIdx_y (hc_get_group_id(1)) +#define blockIdx_z (hc_get_group_id(2)) + +#define blockDim_x (hc_get_group_size(0)) +#define blockDim_y (hc_get_group_size(1)) +#define blockDim_z (hc_get_group_size(2)) + +#define gridDim_x (hc_get_num_groups(0)) +#define gridDim_y (hc_get_num_groups(1)) +#define gridDim_z (hc_get_num_groups(2)) + + #include #include diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile index 8fb13b8954..a917cf1656 100644 --- a/lib/kokkos/core/src/Makefile +++ b/lib/kokkos/core/src/Makefile @@ -88,6 +88,7 @@ build-makefile-kokkos: echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos echo "" >> Makefile.kokkos echo "#Variables used in application Makefiles" >> Makefile.kokkos + echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 37d2ac8318..de84f6e59f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f , thread_local_bytes ); + omp_set_num_threads(partition_size); f( omp_get_thread_num(), omp_get_num_threads() ); Impl::t_openmp_instance->~Exec(); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index 0b7a1e2583..f2674e5929 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -113,7 +113,6 @@ void reduce_enqueue( if (output_length < 1) return; - assert(output_result != nullptr); const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size); // allocate host and device memory for the results from each team @@ -176,14 +175,17 @@ void reduce_enqueue( } }); - ValueInit::init(ReducerConditional::select(f, reducer), output_result); + if (output_result != nullptr) + ValueInit::init(ReducerConditional::select(f, reducer), output_result); fut.wait(); copy(result,result_cpu.data()); - for(std::size_t i=0;i result(td.num_tiles); hc::array scratch(len); - tile_for(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] + tile_for(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] { const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; @@ -135,7 +135,7 @@ void scan_enqueue( ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]); copy(result_cpu.data(),result); - hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] + hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] { // const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index 3d3029535e..c5e73c8b26 100644 --- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept return full != i ? _bit_scan_forward( ~i ) : -1 ; #elif defined( KOKKOS_COMPILER_IBM ) return full != i ? __cnttz4( ~i ) : -1 ; +#elif defined( KOKKOS_COMPILER_CRAYC ) + return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; #else @@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i ) return _bit_scan_forward(i); #elif defined( KOKKOS_COMPILER_IBM ) return __cnttz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? _popcnt(~i & (i-1)) : -1; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_ffs(i) - 1; #else - unsigned t = 1u; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t << 1; - ++r; + int offset = -1; + if ( i ) { + for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset ); } - return r; + return offset; #endif } @@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i ) return _bit_scan_reverse(i); #elif defined( KOKKOS_COMPILER_IBM ) return shift - __cntlz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? shift - _leadz32(i) : 0 ; #elif defined( __GNUC__ ) || defined( __GNUG__ ) return shift - __builtin_clz(i); #else - unsigned t = 1u << shift; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t >> 1; - ++r; + int offset = 0; + if ( i ) { + for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset ); } - return r; + return offset; #endif } @@ -142,6 +142,8 @@ int bit_count( unsigned i ) return _popcnt32(i); #elif defined( KOKKOS_COMPILER_IBM ) return __popcnt4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return _popcnt(i); #elif defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_popcount(i); #else diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index e11f8b6d34..cd0553218d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s } } -constexpr const char* HBWSpace::name() { - return m_name; -} - } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index ce5537fed3..c7f681699e 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -114,7 +114,7 @@ struct TestComplexBasicMath { typename Kokkos::View*,ExecSpace>::HostMirror h_results; void testit () { - d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",20); + d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",24); h_results = Kokkos::create_mirror_view(d_results); Kokkos::parallel_for(Kokkos::RangePolicy(0,1), *this); @@ -125,6 +125,7 @@ struct TestComplexBasicMath { std::complex b(3.25,5.75); std::complex d(1.0,2.0); double c = 9.3; + int e = 2; std::complex r; r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag()); @@ -147,6 +148,12 @@ struct TestComplexBasicMath { r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag()); r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag()); r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag()); + + r = a; + /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag()); + /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag()); + /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e); + /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e); } KOKKOS_INLINE_FUNCTION @@ -190,6 +197,12 @@ struct TestComplexBasicMath { d_results(17) = c-a; d_results(18) = c*a; d_results(19) = c/a; + + int e = 2; + d_results(20) = a+e; + d_results(21) = a-e; + d_results(22) = a*e; + d_results(23) = a/e; } }; diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index f579ddf02c..fbc3a65c2f 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -286,7 +286,9 @@ struct TestMDRange_2D { // Test with reducers - scalar { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; - range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + int s0 = 1; + int s1 = 1; + range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} ); TestMDRange_2D functor( N0, N1 ); @@ -297,7 +299,7 @@ struct TestMDRange_2D { parallel_reduce( range, functor, reducer_scalar ); - ASSERT_EQ( sum, 2 * N0 * N1 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) ); } // Test with reducers - scalar view { @@ -445,7 +447,9 @@ struct TestMDRange_2D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + const int s0 = 1; + const int s1 = 1; + range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); parallel_for( range, functor ); @@ -454,8 +458,8 @@ struct TestMDRange_2D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) { if ( h_view( i, j ) != 3 ) { ++counter; @@ -463,7 +467,7 @@ struct TestMDRange_2D { } if ( counter != 0 ) { - printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); + printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -699,6 +703,7 @@ struct TestMDRange_2D { ASSERT_EQ( counter, 0 ); } + } // end test_for2 }; // MDRange_2D @@ -749,7 +754,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); @@ -757,7 +765,7 @@ struct TestMDRange_3D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) ); } // Test with reducers - scalar @@ -952,7 +960,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); parallel_for( range, functor ); @@ -961,9 +972,9 @@ struct TestMDRange_3D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) { if ( h_view( i, j, k ) != 3 ) { ++counter; @@ -971,7 +982,7 @@ struct TestMDRange_3D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -1207,7 +1218,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); @@ -1215,7 +1230,7 @@ struct TestMDRange_4D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) ); } // Test with reducers - scalar @@ -1415,7 +1430,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); parallel_for( range, functor ); @@ -1424,10 +1443,10 @@ struct TestMDRange_4D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) { if ( h_view( i, j, k, l ) != 3 ) { ++counter; @@ -1435,7 +1454,7 @@ struct TestMDRange_4D { } if ( counter != 0 ) { - printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); + printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); } ASSERT_EQ( counter, 0 ); @@ -1682,7 +1701,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); @@ -1690,7 +1714,7 @@ struct TestMDRange_5D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) ); } // Test with reducers - scalar @@ -1810,7 +1834,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); parallel_for( range, functor ); @@ -1819,11 +1848,11 @@ struct TestMDRange_5D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) { if ( h_view( i, j, k, l, m ) != 3 ) { ++counter; @@ -1831,7 +1860,7 @@ struct TestMDRange_5D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -2084,7 +2113,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); @@ -2092,7 +2127,7 @@ struct TestMDRange_6D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) ); } // Test with reducers - scalar @@ -2214,7 +2249,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); parallel_for( range, functor ); @@ -2223,12 +2264,12 @@ struct TestMDRange_6D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) - for ( int n = 0; n < N5; ++n ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) + for ( int n = s5; n < N5; ++n ) { if ( h_view( i, j, k, l, m, n ) != 3 ) { ++counter; @@ -2236,7 +2277,7 @@ struct TestMDRange_6D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); diff --git a/lib/latte/Install.py b/lib/latte/Install.py index b3e771e4cc..37cb5d6b17 100644 --- a/lib/latte/Install.py +++ b/lib/latte/Install.py @@ -159,13 +159,13 @@ if buildflag or pathflag: os.remove("includelink") if os.path.isfile("liblink") or os.path.islink("liblink"): os.remove("liblink") - if os.path.isfile("filelink") or os.path.islink("filelink"): - os.remove("filelink") + if os.path.isfile("filelink.o") or os.path.islink("filelink.o"): + os.remove("filelink.o") cmd = 'ln -s "%s/src" includelink' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) cmd = 'ln -s "%s" liblink' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir + cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) # copy Makefile.lammps.suffix to Makefile.lammps diff --git a/lib/latte/Makefile.lammps.gfortran b/lib/latte/Makefile.lammps.gfortran index 921721552b..6aa7782f8a 100644 --- a/lib/latte/Makefile.lammps.gfortran +++ b/lib/latte/Makefile.lammps.gfortran @@ -3,5 +3,5 @@ # GNU Fortran settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas +latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas latte_SYSPATH = -fopenmp diff --git a/lib/latte/Makefile.lammps.ifort b/lib/latte/Makefile.lammps.ifort index 23d2b32fcc..0491bdd8a5 100644 --- a/lib/latte/Makefile.lammps.ifort +++ b/lib/latte/Makefile.lammps.ifort @@ -3,7 +3,7 @@ # Intel ifort settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink \ +latte_SYSLIB = ../../lib/latte/filelink.o \ -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \ -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \ -openmp -O0 diff --git a/src/.gitignore b/src/.gitignore index 8fe732ece6..3036343d2a 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -414,6 +414,8 @@ /fix_lambdah_calc.h /fix_langevin_eff.cpp /fix_langevin_eff.h +/fix_latte.cpp +/fix_latte.h /fix_lb_fluid.cpp /fix_lb_fluid.h /fix_lb_momentum.cpp diff --git a/src/Depend.sh b/src/Depend.sh index 9463607960..e1c812ebc2 100644 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then depend KOKKOS fi +if (test $1 = "USER-DRUDE") then + depend USER-OMP +fi + if (test $1 = "USER-FEP") then depend USER-OMP fi diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp index b63dc5fb8c..6c610c8c11 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp +++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp @@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecAtomicKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecAtomicKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecAtomicKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecAtomicKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) { - sync(Host,F_MASK); - modified(Host,F_MASK); - } - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecAtomicKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h index 5e9a72c2e3..e4d2654e2c 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.h +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { virtual ~AtomVecAtomicKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); void unpack_border(int, int, double *); @@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp index e0f29a27bb..076144420c 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.cpp +++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp @@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecBondKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecBondKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecBondKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h index 3dcc99fa78..7ec15450ef 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.h +++ b/src/KOKKOS/atom_vec_bond_kokkos.h @@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { virtual ~AtomVecBondKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_tagint_1d d_molecule; DAT::t_int_2d d_nspecial; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp index 89f7e91c2b..7b8b74b405 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.cpp +++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp @@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm { /* ---------------------------------------------------------------------- */ -int AtomVecChargeKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecChargeKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecChargeKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecChargeKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h index f9b385e7ed..e9ff70bbe1 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.h +++ b/src/KOKKOS/atom_vec_charge_kokkos.h @@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { virtual ~AtomVecChargeKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h index 372404cc7d..cec1b82357 100644 --- a/src/KOKKOS/atom_vec_dpd_kokkos.h +++ b/src/KOKKOS/atom_vec_dpd_kokkos.h @@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp index fd7eaf7c81..8e9abe4067 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.cpp +++ b/src/KOKKOS/atom_vec_full_kokkos.cpp @@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecFullKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0() - *buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst), - _list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, - const int nfirst, const int &pbc_flag, - const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecFullKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecFullKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h index 760df087e1..33760a8b5f 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.h +++ b/src/KOKKOS/atom_vec_full_kokkos.h @@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { virtual ~AtomVecFullKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; HAT::t_float_1d h_q; diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp index 5542991395..03fb2a4ead 100644 --- a/src/KOKKOS/atom_vec_kokkos.cpp +++ b/src/KOKKOS/atom_vec_kokkos.cpp @@ -12,6 +12,10 @@ ------------------------------------------------------------------------- */ #include "atom_vec_kokkos.h" +#include "atom_kokkos.h" +#include "comm_kokkos.h" +#include "domain.h" +#include "atom_masks.h" using namespace LAMMPS_NS; @@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp) buffer_size = 0; } +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_xfloat_2d_um _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_list(list.view()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + _buf(i,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, + const int* const pbc) +{ + // Check whether to always run forward communication on the host + // Choose correct forward PackComm kernel + + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + + return n*size_forward; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackCommSelf { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_x_array _xw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackCommSelf( + const typename DAT::tdual_x_array &x, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _xw(i+_nfirst,0) = _x(j,0); + _xw(i+_nfirst,1) = _x(j,1); + _xw(i+_nfirst,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst, const int &pbc_flag, const int* const pbc) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnpackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array _x; + typename ArrayTypes::t_xfloat_2d_const _buf; + int _first; + + AtomVecKokkos_UnpackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const int& first):_x(x.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf ) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + if (atom->mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + buf[m++] = h_v(j,1) + dvy; + buf[m++] = h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_ffloat_2d _buf; + int _first; + + AtomVecKokkos_PackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const int& first):_f(f.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _buf(i,0) = _f(i+_first,0); + _buf(i,1) = _f(i+_first,1); + _buf(i,2) = _f(i+_first,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first, + const DAT::tdual_ffloat_2d &buf ) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } + + return n*size_reverse; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverseSelf { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_f_array _fw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverseSelf( + const typename DAT::tdual_f_array &f, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_fw(f.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap) { + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + _fw(j,0) += _f(i+_nfirst,0); + _fw(j,1) += _f(i+_nfirst,1); + _fw(j,2) += _f(i+_nfirst,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array _f; + typename ArrayTypes::t_ffloat_2d_const _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_list(list.view()),_iswap(iswap) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + _f(j,0) += _buf(i,0); + _f(j,1) += _buf(i,1); + _f(j,2) += _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_ffloat_2d &buf) +{ + // Check whether to always run reverse communication on the host + // Choose correct reverse UnPackReverse kernel + + if(commKK->reverse_comm_on_host) { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse(int n, int first, double *buf) +{ + if(n > 0) + sync(Host,F_MASK); + + int m = 0; + const int last = first + n; + for (int i = first; i < last; i++) { + buf[m++] = h_f(i,0); + buf[m++] = h_f(i,1); + buf[m++] = h_f(i,2); + } + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf) +{ + int m = 0; + for (int i = 0; i < n; i++) { + const int j = list[i]; + h_f(j,0) += buf[m++]; + h_f(j,1) += buf[m++]; + h_f(j,2) += buf[m++]; + } + + if(n > 0) + modified(Host,F_MASK); +} diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h index 7f593f235f..20a07ec443 100644 --- a/src/KOKKOS/atom_vec_kokkos.h +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec { public: AtomVecKokkos(class LAMMPS *); virtual ~AtomVecKokkos() {} + virtual int pack_comm(int, int *, double *, int, int *); + virtual int pack_comm_vel(int, int *, double *, int, int *); + virtual void unpack_comm(int, int, double *); + virtual void unpack_comm_vel(int, int, double *); + virtual int pack_reverse(int, int, double *); + virtual void unpack_reverse(int, int *, double *); virtual void sync(ExecutionSpace space, unsigned int mask) = 0; virtual void modified(ExecutionSpace space, unsigned int mask) = 0; - virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {}; + virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0; virtual int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf) = 0; + const DAT::tdual_xfloat_2d &buf); + + virtual int + unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst); + + virtual int + pack_reverse_kokkos(const int &n, const int &nfirst, + const DAT::tdual_ffloat_2d &buf); + + virtual void + unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const DAT::tdual_ffloat_2d &buf); + virtual int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) = 0; - //{return 0;}; + virtual void unpack_border_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf, @@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec { DAT::tdual_int_1d k_sendlist, DAT::tdual_int_1d k_copylist, ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0; - //{return 0;}; + virtual int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space) = 0; - //{return 0;}; + protected: + HAT::t_x_array h_x; + HAT::t_v_array h_v; + HAT::t_f_array h_f; + class CommKokkos *commKK; size_t buffer_size; void* buffer; diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index f5ed0f525f..5534341342 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]); memory->sfree(sendlist); sendlist = NULL; - k_sendlist = ArrayTypes::tdual_int_2d(); + k_sendlist = DAT::tdual_int_2d(); + k_total_send = DAT::tdual_int_scalar("comm::k_total_send"); // error check for disallow of OpenMP threads? @@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) memory->destroy(buf_recv); buf_recv = NULL; - k_exchange_sendlist = ArrayTypes:: + k_exchange_sendlist = DAT:: tdual_int_1d("comm:k_exchange_sendlist",100); - k_exchange_copylist = ArrayTypes:: + k_exchange_copylist = DAT:: tdual_int_1d("comm:k_exchange_copylist",100); - k_count = ArrayTypes::tdual_int_1d("comm:k_count",1); - k_sendflag = ArrayTypes::tdual_int_1d("comm:k_sendflag",100); + k_count = DAT::tdual_int_scalar("comm:k_count"); + k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100); memory->destroy(maxsendlist); maxsendlist = NULL; @@ -102,8 +103,10 @@ void CommKokkos::init() atomKK = (AtomKokkos *) atom; exchange_comm_classic = lmp->kokkos->exchange_comm_classic; forward_comm_classic = lmp->kokkos->forward_comm_classic; + reverse_comm_classic = lmp->kokkos->reverse_comm_classic; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; forward_comm_on_host = lmp->kokkos->forward_comm_on_host; + reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host; CommBrick::init(); @@ -132,8 +135,11 @@ void CommKokkos::init() if (force->newton == 0) check_reverse = 0; if (force->pair) check_reverse += force->pair->comm_reverse_off; - if(check_reverse || check_forward) + if (ghost_velocity) forward_comm_classic = true; + + if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet + reverse_comm_classic = true; } /* ---------------------------------------------------------------------- @@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy) int n; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; - double **x = atom->x; double *buf; // exchange data with another proc @@ -181,32 +186,29 @@ void CommKokkos::forward_comm_device(int dummy) // if comm_x_only set, exchange or copy directly to x, don't unpack k_sendlist.sync(); + atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); for (int iswap = 0; iswap < nswap; iswap++) { - if (sendproc[iswap] != me) { if (comm_x_only) { - atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; - if (size_forward_recv[iswap]) { buf = atomKK->k_x.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_x.view().dimension_1(); MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + recvproc[iswap],0,world,&request); } n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]); - if (n) { MPI_Send(k_buf_send.view().ptr_on_device(), n,MPI_DOUBLE,sendproc[iswap],0,world); } - if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); - atomKK->modified(ExecutionSpaceFromDevice:: - space,X_MASK); + if (size_forward_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,X_MASK); + } } else if (ghost_velocity) { error->all(FLERR,"Ghost velocity forward comm not yet " "implemented with Kokkos"); @@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy) } } } + +/* ---------------------------------------------------------------------- + reverse communication of forces on atoms every timestep + other per-atom attributes may also be sent via pack/unpack routines +------------------------------------------------------------------------- */ + void CommKokkos::reverse_comm() { + if (!reverse_comm_classic) { + if (reverse_comm_on_host) reverse_comm_device(); + else reverse_comm_device(); + return; + } + k_sendlist.sync(); + if (comm_f_only) atomKK->sync(Host,F_MASK); else atomKK->sync(Host,ALL_MASK); + CommBrick::reverse_comm(); + if (comm_f_only) atomKK->modified(Host,F_MASK); else atomKK->modified(Host,ALL_MASK); - atomKK->sync(Device,ALL_MASK); + + //atomKK->sync(Device,ALL_MASK); // is this needed? } +template +void CommKokkos::reverse_comm_device() +{ + int n; + MPI_Request request; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + double *buf; + + // exchange data with another proc + // if other proc is self, just copy + // if comm_f_only set, exchange or copy directly from f, don't pack + + k_sendlist.sync(); + atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); + + for (int iswap = nswap-1; iswap >= 0; iswap--) { + if (sendproc[iswap] != me) { + if (comm_f_only) { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + if (size_reverse_send[iswap]) { + buf = atomKK->k_f.view().ptr_on_device() + + firstrecv[iswap]*atomKK->k_f.view().dimension_1(); + + MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, + recvproc[iswap],0,world); + } + if (size_reverse_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,F_MASK); + } + } else { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(), + size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send); + if (n) + MPI_Send(k_buf_send.view().ptr_on_device(),n, + MPI_DOUBLE,recvproc[iswap],0,world); + if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap, + k_buf_recv); + } else { + if (sendnum[iswap]) + n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap, + firstrecv[iswap]); + } + } +} + +/* ---------------------------------------------------------------------- */ + void CommKokkos::forward_comm_fix(Fix *fix, int size) { k_sendlist.sync(); @@ -408,7 +482,7 @@ struct BuildExchangeListFunctor { typename AT::t_x_array _x; int _nlocal,_dim; - typename AT::t_int_1d _nsend; + typename AT::t_int_scalar _nsend; typename AT::t_int_1d _sendlist; typename AT::t_int_1d _sendflag; @@ -416,7 +490,7 @@ struct BuildExchangeListFunctor { BuildExchangeListFunctor( const typename AT::tdual_x_array x, const typename AT::tdual_int_1d sendlist, - typename AT::tdual_int_1d nsend, + typename AT::tdual_int_scalar nsend, typename AT::tdual_int_1d sendflag,int nlocal, int dim, X_FLOAT lo, X_FLOAT hi): _x(x.template view()), @@ -430,7 +504,7 @@ struct BuildExchangeListFunctor { KOKKOS_INLINE_FUNCTION void operator() (int i) const { if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { - const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1); + const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1); if(mysend<_sendlist.dimension_0()) { _sendlist(mysend) = i; _sendflag(i) = 1; @@ -489,9 +563,9 @@ void CommKokkos::exchange_device() if (true) { if (k_sendflag.h_view.dimension_0()(); - k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0(); - while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_count.h_view(0) = 0; + k_count.h_view() = k_exchange_sendlist.h_view.dimension_0(); + while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_count.h_view() = 0; k_count.modify(); k_count.sync(); @@ -504,10 +578,10 @@ void CommKokkos::exchange_device() k_count.modify(); k_count.sync(); - if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_exchange_sendlist.resize(k_count.h_view(0)*1.1); - k_exchange_copylist.resize(k_count.h_view(0)*1.1); - k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0(); + if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_exchange_sendlist.resize(k_count.h_view()*1.1); + k_exchange_copylist.resize(k_count.h_view()*1.1); + k_count.h_view()=k_exchange_sendlist.h_view.dimension_0(); } } k_exchange_copylist.sync(); @@ -515,22 +589,22 @@ void CommKokkos::exchange_device() k_sendflag.sync(); int sendpos = nlocal-1; - nlocal -= k_count.h_view(0); - for(int i = 0; i < k_count.h_view(0); i++) { + nlocal -= k_count.h_view(); + for(int i = 0; i < k_count.h_view(); i++) { if (k_exchange_sendlist.h_view(i)(); k_exchange_copylist.sync(); - nsend = k_count.h_view(0); + nsend = k_count.h_view(); if (nsend > maxsend) grow_send_kokkos(nsend,1); nsend = - avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send, + avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send, k_exchange_sendlist,k_exchange_copylist, ExecutionSpaceFromDevice:: space,dim,lo,hi); @@ -640,9 +714,7 @@ void CommKokkos::borders() } atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); k_sendlist.sync(); - k_sendlist.modify(); CommBrick::borders(); k_sendlist.modify(); atomKK->modified(Host,ALL_MASK); @@ -659,11 +731,11 @@ struct BuildBorderListFunctor { int iswap,maxsendlist; int nfirst,nlast,dim; typename AT::t_int_2d sendlist; - typename AT::t_int_1d nsend; + typename AT::t_int_scalar nsend; BuildBorderListFunctor(typename AT::tdual_x_array _x, typename AT::tdual_int_2d _sendlist, - typename AT::tdual_int_1d _nsend,int _nfirst, + typename AT::tdual_int_scalar _nsend,int _nfirst, int _nlast, int _dim, X_FLOAT _lo, X_FLOAT _hi, int _iswap, int _maxsendlist): @@ -684,7 +756,7 @@ struct BuildBorderListFunctor { for (int i=teamstart + dev.team_rank(); i= lo && x(i,dim) <= hi) mysend++; } - const int my_store_pos = dev.team_scan(mysend,&nsend(0)); + const int my_store_pos = dev.team_scan(mysend,&nsend()); if (my_store_pos+mysend < maxsendlist) { mysend = my_store_pos; @@ -713,7 +785,7 @@ void CommKokkos::borders_device() { AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; ExecutionSpace exec_space = ExecutionSpaceFromDevice::space; - k_sendlist.modify(); + k_sendlist.sync(); atomKK->sync(exec_space,ALL_MASK); // do swaps over all 3 dimensions @@ -763,37 +835,38 @@ void CommKokkos::borders_device() { if (sendflag) { if (!bordergroup || ineed >= 2) { if (style == SINGLE) { - typename ArrayTypes::tdual_int_1d total_send("TS",1); - total_send.h_view(0) = 0; - if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); - } + k_total_send.h_view() = 0; + k_total_send.template modify(); + k_total_send.template sync(); BuildBorderListFunctor f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + k_total_send.template modify(); + k_total_send.template sync(); + + k_sendlist.modify(); + + if(k_total_send.h_view() >= maxsendlist[iswap]) { + grow_list(iswap,k_total_send.h_view()); + + k_total_send.h_view() = 0; + k_total_send.template modify(); + k_total_send.template sync(); - if(total_send.h_view(0) >= maxsendlist[iswap]) { - grow_list(iswap,total_send.h_view(0)); - k_sendlist.modify(); - total_send.h_view(0) = 0; - if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); - } BuildBorderListFunctor f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + + k_total_send.template modify(); + k_total_send.template sync(); + + k_sendlist.modify(); } - nsend = total_send.h_view(0); + nsend = k_total_send.h_view(); } else { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); @@ -916,10 +989,11 @@ void CommKokkos::borders_device() { // reset global->local map - if (exec_space == Host) k_sendlist.sync(); atomKK->modified(exec_space,ALL_MASK); - atomKK->sync(Host,TAG_MASK); - if (map_style) atom->map_set(); + if (map_style) { + atomKK->sync(Host,TAG_MASK); + atom->map_set(); + } } /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR and bufextra @@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) buf_send = k_buf_send.view().ptr_on_device(); } else { - k_buf_send = ArrayTypes:: + k_buf_send = DAT:: tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); buf_send = k_buf_send.view().ptr_on_device(); } @@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) { maxrecv = static_cast (BUFFACTOR * n); int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; - k_buf_recv = ArrayTypes:: + k_buf_recv = DAT:: tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); buf_recv = k_buf_recv.view().ptr_on_device(); } @@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n) { int size = static_cast (BUFFACTOR * n); + if (exchange_comm_classic) { // force realloc on Host + k_sendlist.sync(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); for(int i=0;i(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); memory->grow(maxsendlist,n,"comm:maxsendlist"); diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index a8ae973124..f137655cb8 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -25,15 +25,17 @@ class CommKokkos : public CommBrick { bool exchange_comm_classic; bool forward_comm_classic; + bool reverse_comm_classic; bool exchange_comm_on_host; bool forward_comm_on_host; + bool reverse_comm_on_host; CommKokkos(class LAMMPS *); ~CommKokkos(); void init(); void forward_comm(int dummy = 0); // forward comm of atom coords - void reverse_comm(); // reverse comm of atom coords + void reverse_comm(); // reverse comm of atom coords void exchange(); // move atoms to new procs void borders(); // setup list of atoms to comm @@ -47,15 +49,17 @@ class CommKokkos : public CommBrick { void reverse_comm_dump(class Dump *); // reverse comm from a Dump template void forward_comm_device(int dummy); + template void reverse_comm_device(); template void forward_comm_pair_device(Pair *pair); template void exchange_device(); template void borders_device(); protected: DAT::tdual_int_2d k_sendlist; + DAT::tdual_int_scalar k_total_send; DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag; - DAT::tdual_int_1d k_count; + DAT::tdual_int_scalar k_count; //double *buf_send; // send buffer for all comm //double *buf_recv; // recv buffer for all comm diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index e54b53ae89..5d2f6a0438 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) : nmax = nmax = m_cap = 0; allocated_flag = 0; + nprev = 4; } /* ---------------------------------------------------------------------- */ @@ -158,15 +159,15 @@ void FixQEqReaxKokkos::init_hist() { int i,j; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = k_t_hist.h_view; for( i = 0; i < atom->nmax; i++ ) - for( j = 0; j < 5; j++ ) + for( j = 0; j < nprev; j++ ) k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0; k_s_hist.template modify(); @@ -334,11 +335,11 @@ void FixQEqReaxKokkos::allocate_array() d_d = k_d.template view(); h_d = k_d.h_view; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = k_t_hist.h_view; } @@ -368,7 +369,7 @@ void FixQEqReaxKokkos::zero_item(int ii) const d_o[i] = 0.0; d_r[i] = 0.0; d_d[i] = 0.0; - //for( int j = 0; j < 5; j++ ) + //for( int j = 0; j < nprev; j++ ) //d_s_hist(i,j) = d_t_hist(i,j) = 0.0; } @@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos::calculate_q_item(int ii) const if (mask[i] & groupbit) { q(i) = d_s[i] - delta * d_t[i]; - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { d_s_hist(i,k) = d_s_hist(i,k-1); d_t_hist(i,k) = d_t_hist(i,k-1); } @@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos::memory_usage() { double bytes; - bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist + bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage bytes += n_cap*2 * sizeof(int); // matrix... bytes += m_cap * sizeof(int); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 072a802b54..2b02624dce 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) neighflag_qeq_set = 0; exchange_comm_classic = 0; forward_comm_classic = 0; + reverse_comm_classic = 0; exchange_comm_on_host = 0; forward_comm_on_host = 0; + reverse_comm_on_host = 0; #ifdef KILL_KOKKOS_ON_SIGSEGV signal(SIGSEGV, my_signal_handler); @@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg) neighflag_qeq_set = 0; int newtonflag = 0; double binsize = 0.0; - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; int iarg = 0; while (iarg < narg) { @@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg) } else if (strcmp(arg[iarg],"comm") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { - exchange_comm_classic = forward_comm_classic = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; } else if (strcmp(arg[iarg+1],"host") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; } else if (strcmp(arg[iarg],"comm/exchange") == 0) { @@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; + } else if (strcmp(arg[iarg],"comm/reverse") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 0; + } else error->all(FLERR,"Illegal package kokkos command"); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 8e28b38cbf..7b7848f1f0 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers { int neighflag_qeq_set; int exchange_comm_classic; int forward_comm_classic; + int reverse_comm_classic; int exchange_comm_on_host; int forward_comm_on_host; + int reverse_comm_on_host; int num_threads,ngpu; int numa; int auto_sync; diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp index c7e815928a..95ea105ad9 100644 --- a/src/KOKKOS/nbin_kokkos.cpp +++ b/src/KOKKOS/nbin_kokkos.cpp @@ -75,6 +75,10 @@ void NBinKokkos::bin_atoms_setup(int nall) k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins); bincount = k_bincount.view(); } + if (nall > k_atom2bin.d_view.dimension_0()) { + k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall); + atom2bin = k_atom2bin.view(); + } } /* ---------------------------------------------------------------------- @@ -86,6 +90,10 @@ void NBinKokkos::bin_atoms() { last_bin = update->ntimestep; + k_bins.template sync(); + k_bincount.template sync(); + k_atom2bin.template sync(); + h_resize() = 1; while(h_resize() > 0) { @@ -115,6 +123,10 @@ void NBinKokkos::bin_atoms() c_bins = bins; } } + + k_bins.template modify(); + k_bincount.template modify(); + k_atom2bin.template modify(); } /* ---------------------------------------------------------------------- */ @@ -125,6 +137,7 @@ void NBinKokkos::binatomsItem(const int &i) const { const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2)); + atom2bin(i) = ibin; const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1); if(ac < bins.dimension_1()) { bins(ibin, ac) = i; diff --git a/src/KOKKOS/nbin_kokkos.h b/src/KOKKOS/nbin_kokkos.h index de3cf41d19..bf2ccc5908 100644 --- a/src/KOKKOS/nbin_kokkos.h +++ b/src/KOKKOS/nbin_kokkos.h @@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard { int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; typename AT::t_int_1d bincount; const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + typename AT::t_int_1d atom2bin; typename AT::t_int_scalar d_resize; typename ArrayTypes::t_int_scalar h_resize; typename AT::t_x_array_randomread x; diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp index 9a40808052..f34b149864 100644 --- a/src/KOKKOS/neighbor_kokkos.cpp +++ b/src/KOKKOS/neighbor_kokkos.cpp @@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag) // build pairwise lists for all perpetual NPair/NeighList // grow() with nlocal/nall args so that only realloc if have to - atomKK->sync(Host,ALL_MASK); for (i = 0; i < npair_perpetual; i++) { m = plist[i]; + if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK); if (!lists[m]->copy) lists[m]->grow(nlocal,nall); neigh_pair[m]->build_setup(); neigh_pair[m]->build(lists[m]); diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index b568bd5c93..d3cdcb0680 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -73,6 +73,7 @@ void NPairKokkos::copy_bin_info() atoms_per_bin = nbKK->atoms_per_bin; k_bincount = nbKK->k_bincount; k_bins = nbKK->k_bins; + k_atom2bin = nbKK->k_atom2bin; } /* ---------------------------------------------------------------------- @@ -88,13 +89,15 @@ void NPairKokkos::copy_stencil_info() int maxstencil = ns->get_maxstencil(); - k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); + if (maxstencil > k_stencil.dimension_0()) + k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); for (int k = 0; k < maxstencil; k++) k_stencil.h_view(k) = ns->stencil[k]; k_stencil.modify(); k_stencil.sync(); if (GHOST) { - k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); + if (maxstencil > k_stencilxyz.dimension_0()) + k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); for (int k = 0; k < maxstencil; k++) { k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0]; k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1]; @@ -122,6 +125,7 @@ void NPairKokkos::build(NeighList *list_) k_cutneighsq.view(), k_bincount.view(), k_bins.view(), + k_atom2bin.view(), nstencil, k_stencil.view(), k_stencilxyz.view(), @@ -164,8 +168,9 @@ void NPairKokkos::build(NeighList *list_) k_ex_mol_group.sync(); k_ex_mol_bit.sync(); k_ex_mol_intra.sync(); - k_bincount.sync(), - k_bins.sync(), + k_bincount.sync(); + k_bins.sync(); + k_atom2bin.sync(); atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK); data.special_flag[0] = special_flag[0]; @@ -317,7 +322,7 @@ void NeighborKokkosExecute:: const X_FLOAT ztmp = x(i, 2); const int itype = type(i); - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); const typename ArrayTypes::t_int_1d_const_um stencil = d_stencil; @@ -431,7 +436,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; @@ -641,7 +646,7 @@ void NeighborKokkosExecute::build_ItemCuda(typename Kokkos::TeamPoli if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } } } @@ -678,7 +683,7 @@ void NeighborKokkosExecute:: // no molecular test when i = ghost atom if (i < nlocal) { - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); for (int k = 0; k < nstencil; k++) { const int jbin = ibin + stencil[k]; for(int m = 0; m < c_bincount(jbin); m++) { @@ -764,7 +769,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; } diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index 517ea546fa..6c1c0e958b 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -105,6 +105,7 @@ class NPairKokkos : public NPair { int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; // data from NStencil class @@ -148,6 +149,8 @@ class NeighborKokkosExecute const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + const typename AT::t_int_1d atom2bin; + const typename AT::t_int_1d_const c_atom2bin; // data from NStencil class @@ -190,6 +193,7 @@ class NeighborKokkosExecute const typename AT::t_xfloat_2d_randomread &_cutneighsq, const typename AT::t_int_1d &_bincount, const typename AT::t_int_2d &_bins, + const typename AT::t_int_1d &_atom2bin, const int _nstencil, const typename AT::t_int_1d &_d_stencil, const typename AT::t_int_1d_3 &_d_stencilxyz, @@ -224,6 +228,7 @@ class NeighborKokkosExecute const int & _xprd_half, const int & _yprd_half, const int & _zprd_half): neigh_list(_neigh_list), cutneighsq(_cutneighsq), bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), + atom2bin(_atom2bin),c_atom2bin(_atom2bin), nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz), nlocal(_nlocal), x(_x),type(_type),mask(_mask),molecule(_molecule), @@ -281,38 +286,6 @@ class NeighborKokkosExecute void build_ItemCuda(typename Kokkos::TeamPolicy::member_type dev) const; #endif - KOKKOS_INLINE_FUNCTION - int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const - { - int ix,iy,iz; - - if (x >= bboxhi[0]) - ix = static_cast ((x-bboxhi[0])*bininvx) + nbinx; - else if (x >= bboxlo[0]) { - ix = static_cast ((x-bboxlo[0])*bininvx); - ix = MIN(ix,nbinx-1); - } else - ix = static_cast ((x-bboxlo[0])*bininvx) - 1; - - if (y >= bboxhi[1]) - iy = static_cast ((y-bboxhi[1])*bininvy) + nbiny; - else if (y >= bboxlo[1]) { - iy = static_cast ((y-bboxlo[1])*bininvy); - iy = MIN(iy,nbiny-1); - } else - iy = static_cast ((y-bboxlo[1])*bininvy) - 1; - - if (z >= bboxhi[2]) - iz = static_cast ((z-bboxhi[2])*bininvz) + nbinz; - else if (z >= bboxlo[2]) { - iz = static_cast ((z-bboxlo[2])*bininvz); - iz = MIN(iz,nbinz-1); - } else - iz = static_cast ((z-bboxlo[2])*bininvz) - 1; - - return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); - } - KOKKOS_INLINE_FUNCTION int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const { diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp index d95cd8f8ae..d5f83f4537 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.cpp +++ b/src/KOKKOS/pair_reaxc_kokkos.cpp @@ -131,6 +131,8 @@ template void PairReaxCKokkos::init_style() { PairReaxC::init_style(); + if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version + fix_reax = NULL; // irequest = neigh request made by parent class @@ -555,8 +557,8 @@ void PairReaxCKokkos::Deallocate_Lookup_Tables() ntypes = atom->ntypes; - for( i = 0; i < ntypes; ++i ) { - for( j = i; j < ntypes; ++j ) + for( i = 0; i <= ntypes; ++i ) { + for( j = i; j <= ntypes; ++j ) if( LR[i][j].n ) { sfree( LR[i][j].y, "LR[i,j].y" ); sfree( LR[i][j].H, "LR[i,j].H" ); diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index e4a3f857d3..adec5ff1bd 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -294,6 +294,7 @@ void VerletKokkos::run(int n) int n_pre_exchange = modify->n_pre_exchange; int n_pre_neighbor = modify->n_pre_neighbor; int n_pre_force = modify->n_pre_force; + int n_pre_reverse = modify->n_pre_reverse; int n_post_force = modify->n_post_force; int n_end_of_step = modify->n_end_of_step; @@ -304,9 +305,9 @@ void VerletKokkos::run(int n) f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0()); - static double time = 0.0; atomKK->sync(Device,ALL_MASK); - Kokkos::Impl::Timer ktimer; + //static double time = 0.0; + //Kokkos::Impl::Timer ktimer; timer->init_timeout(); for (int i = 0; i < n; i++) { @@ -320,10 +321,10 @@ void VerletKokkos::run(int n) // initial time integration - ktimer.reset(); + //ktimer.reset(); timer->stamp(); modify->initial_integrate(vflag); - time += ktimer.seconds(); + //time += ktimer.seconds(); if (n_post_integrate) modify->post_integrate(); timer->stamp(Timer::MODIFY); @@ -523,11 +524,18 @@ void VerletKokkos::run(int n) atomKK->k_f.modify(); } + if (n_pre_reverse) { + modify->pre_reverse(eflag,vflag); + timer->stamp(Timer::MODIFY); + } // reverse communication of forces - if (force->newton) comm->reverse_comm(); - timer->stamp(Timer::COMM); + if (force->newton) { + Kokkos::fence(); + comm->reverse_comm(); + timer->stamp(Timer::COMM); + } // force modifications, final time integration, diagnostics diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2 index a367d54080..45e1ab1f8a 100755 --- a/src/MAKE/MACHINES/Makefile.cori2 +++ b/src/MAKE/MACHINES/Makefile.cori2 @@ -15,13 +15,14 @@ SHELL = /bin/sh CC = CC OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \ + $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = CC -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor index a717be93ff..75e4d89170 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor @@ -10,7 +10,7 @@ CC = mpiicpc MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \ - -qoverride-limits $(MIC_OPT) + -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu old mode 100755 new mode 100644 index b7db064574..41d0f959fe --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -8,14 +8,14 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 8a45b781f8..ef514f43c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,8 +8,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 40d517bce4..e4dc74d79b 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -8,14 +8,14 @@ SHELL = /bin/sh CC = mpicxx -cxx=icc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -cxx=icc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index fe1be99e58..457a64b223 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -9,14 +9,14 @@ SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor deleted file mode 100644 index 406e98b36d..0000000000 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ /dev/null @@ -1,123 +0,0 @@ -# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT - -SHELL = /bin/sh - -# --------------------------------------------------------------------- -# compiler/linker settings -# specify flags and libraries needed for your compiler - -CC = mpiicpc -MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT -SHFLAGS = -fPIC -DEPFLAGS = -M - -LINK = mpiicpc -LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT) -LIB = -ltbbmalloc -SIZE = size - -ARCHIVE = ar -ARFLAGS = -rc -SHLIBFLAGS = -shared - -# --------------------------------------------------------------------- -# LAMMPS-specific settings, all OPTIONAL -# specify settings for LAMMPS features you will use -# if you change any -D setting, do full re-compile after "make clean" - -# LAMMPS ifdef settings -# see possible settings in Section 2.2 (step 4) of manual - -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG - -# MPI library -# see discussion in Section 2.2 (step 5) of manual -# MPI wrapper compiler/linker can provide this info -# can point to dummy MPI library in src/STUBS as in Makefile.serial -# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts -# INC = path for mpi.h, MPI compiler settings -# PATH = path for MPI library -# LIB = name of MPI library - -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -MPI_PATH = -MPI_LIB = - -# FFT library -# see discussion in Section 2.2 (step 6) of manaul -# can be left blank to use provided KISS FFT library -# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings -# PATH = path for FFT library -# LIB = name of FFT library - -FFT_INC = -DFFT_MKL -DFFT_SINGLE -FFT_PATH = -FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core - -# JPEG and/or PNG library -# see discussion in Section 2.2 (step 7) of manual -# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC -# INC = path(s) for jpeglib.h and/or png.h -# PATH = path(s) for JPEG library and/or PNG library -# LIB = name(s) of JPEG library and/or PNG library - -JPG_INC = -JPG_PATH = -JPG_LIB = -ljpeg - -# --------------------------------------------------------------------- -# build rules and dependencies -# do not edit this section - -include Makefile.package.settings -include Makefile.package - -EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) -EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) -EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) -EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) -EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) - -# Path to src files - -vpath %.cpp .. -vpath %.h .. - -# Link target - -$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) - $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) - $(SIZE) $(EXE) - -# Library targets - -lib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) - -shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ - $(OBJ) $(EXTRA_LIB) $(LIB) - -# Compilation rules - -%.o:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -%.d:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ - -%.o:%.cu $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -# Individual dependencies - -depend : fastdep.exe $(SRC) - @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 - -fastdep.exe: ../DEPEND/fastdep.c - cc -O -o $@ $< - -sinclude .depend diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 881c51f0e4..8e266a4fce 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp index 30ebc779c5..14eeac8d66 100644 --- a/src/REPLICA/prd.cpp +++ b/src/REPLICA/prd.cpp @@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg) time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0; bigint clock = 0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp index 5a4d885224..347cd3ba67 100644 --- a/src/REPLICA/tad.cpp +++ b/src/REPLICA/tad.cpp @@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg) nbuild = ndanger = 0; time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index f7163e6791..da553d158a 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,7 +46,7 @@ action nbin_intel.h action nbin_intel.cpp action npair_intel.h action npair_intel.cpp -action intel_simd.h pair_sw_intel.cpp +action intel_simd.h action intel_intrinsics.h pair_tersoff_intel.cpp action intel_intrinsics_airebo.h pair_airebo_intel.cpp diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index 3b84446057..871d881f39 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version: 2017 update 2 - No changes needed 2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 -2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high +2018 inital release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 +2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high ----------------------------------------------------------------------------- When using the suffix command with "intel", intel styles will be used if they exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP -USER-OMP styles will be used whenever USER-INTEL styles are not available. This -allow for running most styles in LAMMPS with threading. +is installed, USER-OMP styles will be used whenever USER-INTEL styles are not +available. This allow for running most styles in LAMMPS with threading. ----------------------------------------------------------------------------- -The Long-Range Thread mode (LRT) in the Intel package currently uses -pthreads by default. If pthreads are not supported in the build environment, -the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for -builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to -build with compilers that support threads using the C++11 standard. When using +The Long-Range Thread mode (LRT) in the Intel package is enabled through the +-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles +include this define. This feature will use pthreads by default. +Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that +support threads intrinsically using the C++11 standard. When using LRT mode, you might need to disable OpenMP affinity settings (e.g. export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings need to be changed. ----------------------------------------------------------------------------- +Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG +should be added to the compile flags. This will enable using the MKL Mersenne +Twister random number generator (RNG) for Dissipative Particle Dynamics +(DPD). This RNG can allow significantly faster performance and it also has a +significantly longer period than the standard RNG for DPD. + +----------------------------------------------------------------------------- + In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of Intel compilers. diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index 434189dd26..62602d5920 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -9,6 +9,7 @@ # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber # in.intel.airebo - Polyethelene benchmark with AIREBO +# in.intel.dpd - Dissipative Particle Dynamics # ############################################################################# @@ -16,16 +17,17 @@ # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017 # - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi # -# Xeon E5-2697v4 Xeon Phi 7250 +# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148 # -# in.intel.lj - 199.5 282.3 -# in.intel.rhodo - 12.4 17.5 -# in.intel.lc - 19.0 25.7 -# in.intel.eam - 59.4 92.8 -# in.intel.sw - 132.4 161.9 -# in.intel.tersoff - 83.3 101.1 -# in.intel.water - 53.4 90.3 -# in.intel.airebo - 7.3 11.8 +# in.intel.lj - 199.5 282.3 317.3 +# in.intel.rhodo - 12.4 17.5 24.4 +# in.intel.lc - 19.0 25.7 26.8 +# in.intel.eam - 59.4 92.8 105.6 +# in.intel.sw - 132.4 161.9 213.8 +# in.intel.tersoff - 83.3 101.1 109.6 +# in.intel.water - 53.4 90.3 105.5 +# in.intel.airebo - 7.3 11.8 17.6 +# in.intel.dpd - 74.5 100.4 148.1 # ############################################################################# diff --git a/src/USER-INTEL/TEST/in.intel.dpd b/src/USER-INTEL/TEST/in.intel.dpd new file mode 100644 index 0000000000..e257d91f84 --- /dev/null +++ b/src/USER-INTEL/TEST/in.intel.dpd @@ -0,0 +1,48 @@ +# DPD benchmark + +variable N index on # Newton Setting +variable w index 10 # Warmup Timesteps +variable t index 4000 # Main Run Timesteps +variable m index 1 # Main Run Timestep Multiplier +variable n index 0 # Use NUMA Mapping for Multi-Node +variable p index 0 # Use Power Measurement + +variable x index 4 +variable y index 2 +variable z index 2 + +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z +variable rr equal floor($t*$m) + +newton $N +if "$n > 0" then "processors * * * grid numa" + +units lj +atom_style atomic +comm_modify mode single vel yes + +lattice fcc 3.0 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box +mass 1 1.0 + +velocity all create 1.0 87287 loop geom + +pair_style dpd 1.0 1.0 928948 +pair_coeff 1 1 25.0 4.5 + +neighbor 0.5 bin +neigh_modify delay 0 every 1 + +fix 1 all nve +timestep 0.04 + +thermo 1000 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp new file mode 100644 index 0000000000..805ffc0e25 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.cpp @@ -0,0 +1,441 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "dihedral_fourier_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp) + : DihedralFourier(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralFourier::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (vflag && !eflag) { + if (force->newton_bond) + eval<0,1,1>(vflag, buffers, fc); + else + eval<0,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralFourierIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + #ifndef LMP_INTEL_USE_SIMDOFF + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + #endif + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t deng; + flt_t df = (flt_t)0.0; + if (EFLAG) deng = (flt_t)0.0; + + for (int j = 0; j < nterms[type]; j++) { + const flt_t tcos_shift = fc.bp[j][type].cos_shift; + const flt_t tsin_shift = fc.bp[j][type].sin_shift; + const flt_t tk = fc.bp[j][type].k; + const int m = fc.bp[j][type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + if (EFLAG) deng += tk * p; + df -= tk * df1; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif + } + + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif + } // omp parallel + + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::init_style() +{ + DihedralFourier::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1, setflag, nterms, memory); + + for (int i = 1; i < bp1; i++) { + if (setflag[i]) { + for (int j = 0; j < nterms[i]; j++) { + fc.bp[j][i].cos_shift = cos_shift[i][j]; + fc.bp[j][i].sin_shift = sin_shift[i][j]; + fc.bp[j][i].k = k[i][j]; + fc.bp[j][i].multiplicity = multiplicity[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::ForceConst::set_ntypes(const int nbondtypes, + int *setflag, + int *nterms, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) { + _maxnterms = 1; + for (int i = 1; i <= nbondtypes; i++) + if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]); + + _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp"); + } + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h new file mode 100644 index 0000000000..a775e129f4 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.h @@ -0,0 +1,82 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(fourier/intel,DihedralFourierIntel) + +#else + +#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H +#define LMP_DIHEDRAL_FOURIER_INTEL_H + +#include "dihedral_fourier.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralFourierIntel : public DihedralFourier { + + public: + DihedralFourierIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed1; + + fc_packed1 **bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); } + + void set_ntypes(const int nbondtypes, int *setflag, int *nterms, + Memory *memory); + + private: + int _nbondtypes, _maxnterms; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 637fc0d06e..eac48b8510 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -285,6 +285,7 @@ int FixIntel::setmask() { int mask = 0; mask |= PRE_REVERSE; + mask |= MIN_PRE_REVERSE; #ifdef _LMP_INTEL_OFFLOAD mask |= POST_FORCE; mask |= MIN_POST_FORCE; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 068e5ed890..d7093e79bb 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -43,6 +43,7 @@ class FixIntel : public Fix { virtual int setmask(); virtual void init(); virtual void setup(int); + inline void min_setup(int in) { setup(in); } void setup_pre_reverse(int eflag = 0, int vflag = 0); void pair_init_check(const bool cdmessage=false); @@ -50,6 +51,8 @@ class FixIntel : public Fix { void kspace_init_check(); void pre_reverse(int eflag = 0, int vflag = 0); + inline void min_pre_reverse(int eflag = 0, int vflag = 0) + { pre_reverse(eflag, vflag); } // Get all forces, calculation results from coprocesser void sync_coprocessor(); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index b4b664cb94..ac208f5a0c 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -409,6 +409,7 @@ void IntelBuffers::grow_ccache(const int off_flag, IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif + memset(_ccachei, 0, vsize * sizeof(int)); memset(_ccachej, 0, vsize * sizeof(int)); #ifdef _LMP_INTEL_OFFLOAD @@ -425,7 +426,7 @@ void IntelBuffers::grow_ccache(const int off_flag, #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ - nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \ + in(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } #ifdef LMP_USE_AVXCD diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index a7663d54a6..d49d0d8b00 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ip = 1; \ + ito = inum; \ + } + #endif #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 12101712f1..e6d45d7b2c 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 79dc75366e..0068e02635 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) { diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp new file mode 100644 index 0000000000..c7cddfccc1 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#include +#include "pair_dpd_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define LMP_MKL_RNG VSL_BRNG_MT19937 +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define IEPSILON 1.0e10 + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::PairDPDIntel(LAMMPS *lmp) : + PairDPD(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + random_thread = NULL; + _nrandom_thread = 0; +} + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::~PairDPDIntel() +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + #endif + delete []random_thread; +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::compute(int eflag, int vflag) +{ + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairDPDIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (_onetype) { + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } +} + +template +void PairDPDIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + typedef struct { double x, y, z; } lmp_vt; + lmp_vt *v = (lmp_vt *)atom->v[0]; + const flt_t dtinvsqrt = 1.0/sqrt(update->dt); + + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const FC_PACKED1_T * _noalias const param = fc.param[0]; + const flt_t * _noalias const special_lj = fc.special_lj; + int * _noalias const rngi_thread = fc.rngi; + const int rng_size = buffers->get_max_nbors(); + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + int *overflow = fix->get_off_overflow_flag(); + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + #ifdef LMP_USE_MKL_RNG + VSLStreamStatePtr *my_random = &(random_thread[tid]); + #else + RanMars *my_random = random_thread[tid]; + #endif + flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; + int rngi = rngi_thread[tid]; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + flt_t icut, a0, gamma, sigma; + if (ONETYPE) { + icut = param[3].icut; + a0 = param[3].a0; + gamma = param[3].gamma; + sigma = param[3].sigma; + } + for (int i = iifrom; i < iito; i += iip) { + int itype, ptr_off; + const FC_PACKED1_T * _noalias parami; + if (!ONETYPE) { + itype = x[i].w; + ptr_off = itype * ntypes; + parami = param + ptr_off; + } + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t vxtmp = v[i].x; + const flt_t vytmp = v[i].y; + const flt_t vztmp = v[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + if (rngi + jnum > rng_size) { + #ifdef LMP_USE_MKL_RNG + if (sizeof(flt_t) == sizeof(float)) + vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (float*)my_rand_buffer, (float)0.0, (float)1.0 ); + else + vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (double*)my_rand_buffer, 0.0, 1.0 ); + #else + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); + #endif + rngi = 0; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < jnum; jj++) { + flt_t forcelj, evdwl; + forcelj = evdwl = (flt_t)0.0; + + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = jlist[jj]; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + if (!ONETYPE) { + jtype = x[j].w; + icut = parami[jtype].icut; + } + const flt_t rsq = delx * delx + dely * dely + delz * delz; + const flt_t rinv = (flt_t)1.0/sqrt(rsq); + + if (rinv > icut) { + flt_t factor_dpd; + if (!ONETYPE) factor_dpd = special_lj[sbindex]; + + flt_t delvx = vxtmp - v[j].x; + flt_t delvy = vytmp - v[j].y; + flt_t delvz = vztmp - v[j].z; + flt_t dot = delx*delvx + dely*delvy + delz*delvz; + flt_t randnum = my_rand_buffer[jj]; + + flt_t iwd = rinv - icut; + if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0; + + if (!ONETYPE) { + a0 = parami[jtype].a0; + gamma = parami[jtype].gamma; + sigma = parami[jtype].sigma; + } + flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt; + if (!ONETYPE) fpair *= factor_dpd; + fpair *= iwd; + + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + flt_t cut = (flt_t)1.0/icut; + flt_t r = (flt_t)1.0/rinv; + evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut); + if (!ONETYPE) evdwl *= factor_dpd; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + } // if rsq + } // for jj + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + rngi += jnum; + } // for ii + + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); + rngi_thread[tid] = rngi; + } // end omp + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EFLAG || vflag) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- + global settings + ------------------------------------------------------------------------- */ + +void PairDPDIntel::settings(int narg, char **arg) { + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + delete []random_thread; + #endif + PairDPD::settings(narg,arg); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #endif +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::init_style() +{ + PairDPD::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_balance() != 0.0) + error->all(FLERR, + "Offload for dpd/intel is not yet available. Set balance to 0."); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + _onetype = 0; + if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; + + int tp1 = atom->ntypes + 1; + fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } else { + cut = init_one(i,j); + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.param[i][j].a0 = a0[i][j]; + fc.param[i][j].gamma = gamma[i][j]; + fc.param[i][j].sigma = sigma[i][j]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::ForceConst::set_ntypes(const int ntypes, + const int nthreads, + const int max_nbors, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + _memory->destroy(param); + _memory->destroy(rand_buffer_thread); + _memory->destroy(rngi); + } + if (ntypes > 0) { + _cop = cop; + memory->create(param,ntypes,ntypes,"fc.param"); + memory->create(rand_buffer_thread, nthreads, max_nbors, + "fc.rand_buffer_thread"); + memory->create(rngi,nthreads,"fc.param"); + for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors; + } + } + _ntypes = ntypes; + _memory = memory; +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts + ------------------------------------------------------------------------- */ + +void PairDPDIntel::read_restart_settings(FILE *fp) +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + delete []random_thread; + #endif + PairDPD::read_restart_settings(fp); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #endif +} diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h new file mode 100644 index 0000000000..416d873c00 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -0,0 +1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/intel,PairDPDIntel) + +#else + +#ifndef LMP_PAIR_DPD_INTEL_H +#define LMP_PAIR_DPD_INTEL_H + +#include "pair_dpd.h" +#include "fix_intel.h" + +#ifdef LMP_USE_MKL_RNG +#include "mkl_vsl.h" +#else +#include "random_mars.h" +#endif + +namespace LAMMPS_NS { + +class PairDPDIntel : public PairDPD { + + public: + PairDPDIntel(class LAMMPS *); + ~PairDPDIntel(); + + virtual void compute(int, int); + void settings(int, char **); + void init_style(); + void read_restart_settings(FILE *); + + private: + FixIntel *fix; + int _cop, _onetype, _nrandom_thread; + + #ifdef LMP_USE_MKL_RNG + VSLStreamStatePtr *random_thread; + #else + RanMars **random_thread; + #endif + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + + template + class ForceConst { + public: + typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1; + + _alignvar(flt_t special_lj[4],64); + fc_packed1 **param; + flt_t **rand_buffer_thread; + int *rngi; + + ForceConst() : _ntypes(0) {} + ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } + + void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, + Memory *memory, const int cop); + + private: + int _ntypes, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. + +*/ diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index 81f4586143..9ff5f85176 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -68,7 +68,7 @@ void VerletLRTIntel::init() _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - #ifdef LMP_INTEL_NOLRT + #ifndef LMP_INTEL_USELRT error->all(FLERR, "LRT otion for Intel package disabled at compile time"); #endif diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index 813cd53605..0d7154ff64 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel) #include "verlet.h" #include "pppm_intel.h" -#ifndef LMP_INTEL_USELRT -#define LMP_INTEL_NOLRT -#else - +#ifdef LMP_INTEL_USELRT #ifdef LMP_INTEL_LRT11 #define _LMP_INTEL_LRT_11 #include diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp index db8c589afb..a9ee35bbfc 100644 --- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp +++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp @@ -134,7 +134,7 @@ public: // Manifold itself: manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp, int narg, char **arg) - : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} + : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} manifold_gaussian_bump::~manifold_gaussian_bump() @@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut() n( x, nn ); double taper_z; if( xx <= rc1 ){ - taper_z = gaussian_bump(xx); + taper_z = gaussian_bump(xx); }else if( xx < rc2 ){ - taper_z = lut_get_z( xx ); + taper_z = lut_get_z( xx ); }else{ - taper_z = 0.0; + taper_z = 0.0; } - fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, + fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, gg, nn[0], nn[1], nn[2] ); } fclose(fp); diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp index f3dec42a83..e1e5f579b8 100644 --- a/src/USER-MISC/fix_srp.cpp +++ b/src/USER-MISC/fix_srp.cpp @@ -98,7 +98,7 @@ int FixSRP::setmask() void FixSRP::init() { - if (force->pair_match("hybrid",1) == NULL) + if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL) error->all(FLERR,"Cannot use pair srp without pair_style hybrid"); int has_rigid = 0; diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index 971f69f7cc..a9532d1077 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) : if (multiproc) error->all(FLERR,"Multi-processor writes are not supported."); - if (multifile) - error->all(FLERR,"Multiple files are not supported."); + if (append_flag && multifile) + error->all(FLERR,"Cannot append when writing to multiple files."); perat = new nc_perat_t[nfield]; @@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF() void DumpNetCDF::openfile() { + char *filecurrent = filename; + if (multifile && !singlefile_opened) { + char *filestar = filecurrent; + filecurrent = new char[strlen(filestar) + 16]; + char *ptr = strchr(filestar,'*'); + *ptr = '\0'; + if (padflag == 0) + sprintf(filecurrent,"%s" BIGINT_FORMAT "%s", + filestar,update->ntimestep,ptr+1); + else { + char bif[8],pad[16]; + strcpy(bif,BIGINT_FORMAT); + sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]); + sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1); + } + *ptr = '*'; + } + if (thermo && !singlefile_opened) { if (thermovar) delete [] thermovar; thermovar = new int[output->thermo->nfield]; @@ -268,14 +286,14 @@ void DumpNetCDF::openfile() ntotalgr = group->count(igroup); if (filewriter) { - if (append_flag && access(filename, F_OK) != -1) { + if (append_flag && !multifile && access(filecurrent, F_OK) != -1) { // Fixme! Perform checks if dimensions and variables conform with // data structure standard. if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( nc_open(filename, NC_WRITE, &ncid), filename ); + NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent ); // dimensions NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR ); @@ -348,8 +366,8 @@ void DumpNetCDF::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid), - filename ); + NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid), + filecurrent ); // dimensions NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim), @@ -598,15 +616,39 @@ void DumpNetCDF::closefile() if (filewriter && singlefile_opened) { NCERR( nc_close(ncid) ); singlefile_opened = 0; - // append next time DumpNetCDF::openfile is called - append_flag = 1; // write to next frame upon next open - framei++; + if (multifile) + framei = 1; + else { + // append next time DumpNetCDF::openfile is called + append_flag = 1; + framei++; + } } } /* ---------------------------------------------------------------------- */ +template +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp) +{ + return nc_put_var1_int(ncid, varid, index, tp); +} + +template <> +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], + const long* tp) +{ + return nc_put_var1_long(ncid, varid, index, tp); +} + +template <> +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], + const long long* tp) +{ + return nc_put_var1_longlong(ncid, varid, index, tp); +} + void DumpNetCDF::write() { // open file @@ -638,13 +680,8 @@ void DumpNetCDF::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue), + NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); -#else - NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue), - th->keyword[i] ); -#endif } } } @@ -887,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg) return 2; } else if (strcmp(arg[iarg],"at") == 0) { + if (!append_flag) + error->all(FLERR,"expected 'append yes' before 'at' keyword"); iarg++; framei = force->inumeric(FLERR,arg[iarg]); if (framei < 0) framei--; @@ -911,68 +950,6 @@ int DumpNetCDF::modify_param(int narg, char **arg) /* ---------------------------------------------------------------------- */ -void DumpNetCDF::write_prmtop() -{ - char fn[1024]; - char tmp[81]; - FILE *f; - - strcpy(fn, filename); - strcat(fn, ".prmtop"); - - f = fopen(fn, "w"); - fprintf(f, "%%VERSION LAMMPS\n"); - fprintf(f, "%%FLAG TITLE\n"); - fprintf(f, "%%FORMAT(20a4)\n"); - memset(tmp, ' ', 76); - tmp[76] = '\0'; - fprintf(f, "NASN%s\n", tmp); - - fprintf(f, "%%FLAG POINTERS\n"); - fprintf(f, "%%FORMAT(10I8)\n"); -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - fprintf(f, "%8li", ntotalgr); -#else - fprintf(f, "%8i", ntotalgr); -#endif - for (int i = 0; i < 11; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - for (int i = 0; i < 12; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - for (int i = 0; i < 6; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - - fprintf(f, "%%FLAG ATOM_NAME\n"); - fprintf(f, "%%FORMAT(20a4)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%4s", "He"); - if ((i+1) % 20 == 0) - fprintf(f, "\n"); - } - - fprintf(f, "%%FLAG CHARGE\n"); - fprintf(f, "%%FORMAT(5E16.5)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%16.5e", 0.0); - if ((i+1) % 5 == 0) - fprintf(f, "\n"); - } - - fprintf(f, "%%FLAG MASS\n"); - fprintf(f, "%%FORMAT(5E16.5)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%16.5e", 1.0); - if ((i+1) % 5 == 0) - fprintf(f, "\n"); - } - fclose(f); -} - -/* ---------------------------------------------------------------------- */ - void DumpNetCDF::ncerr(int err, const char *descr, int line) { if (err != NC_NOERR) { diff --git a/src/USER-NETCDF/dump_netcdf.h b/src/USER-NETCDF/dump_netcdf.h index b86f294d30..25d64efade 100644 --- a/src/USER-NETCDF/dump_netcdf.h +++ b/src/USER-NETCDF/dump_netcdf.h @@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom { void closefile(); virtual void write_header(bigint); virtual void write_data(int, double *); - void write_prmtop(); virtual int modify_param(int, char **); diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index 3b753b1b04..746b904655 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -88,8 +88,8 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) : if (multiproc) error->all(FLERR,"Multi-processor writes are not supported."); - if (multifile) - error->all(FLERR,"Multiple files are not supported."); + if (append_flag && multifile) + error->all(FLERR,"Cannot append when writing to multiple files."); perat = new nc_perat_t[nfield]; @@ -217,6 +217,24 @@ DumpNetCDFMPIIO::~DumpNetCDFMPIIO() void DumpNetCDFMPIIO::openfile() { + char *filecurrent = filename; + if (multifile && !singlefile_opened) { + char *filestar = filecurrent; + filecurrent = new char[strlen(filestar) + 16]; + char *ptr = strchr(filestar,'*'); + *ptr = '\0'; + if (padflag == 0) + sprintf(filecurrent,"%s" BIGINT_FORMAT "%s", + filestar,update->ntimestep,ptr+1); + else { + char bif[8],pad[16]; + strcpy(bif,BIGINT_FORMAT); + sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]); + sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1); + } + *ptr = '*'; + } + if (thermo && !singlefile_opened) { if (thermovar) delete [] thermovar; thermovar = new int[output->thermo->nfield]; @@ -260,7 +278,7 @@ void DumpNetCDFMPIIO::openfile() // get total number of atoms ntotalgr = group->count(igroup); - if (append_flag && access(filename, F_OK) != -1) { + if (append_flag && !multifile && access(filecurrent, F_OK) != -1) { // Fixme! Perform checks if dimensions and variables conform with // data structure standard. @@ -270,8 +288,8 @@ void DumpNetCDFMPIIO::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, - &ncid), filename ); + NCERRX( ncmpi_open(MPI_COMM_WORLD, filecurrent, NC_WRITE, MPI_INFO_NULL, + &ncid), filecurrent ); // dimensions NCERRX( ncmpi_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR ); @@ -344,8 +362,8 @@ void DumpNetCDFMPIIO::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( ncmpi_create(MPI_COMM_WORLD, filename, NC_64BIT_DATA, - MPI_INFO_NULL, &ncid), filename ); + NCERRX( ncmpi_create(MPI_COMM_WORLD, filecurrent, NC_64BIT_DATA, + MPI_INFO_NULL, &ncid), filecurrent ); // dimensions NCERRX( ncmpi_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim), @@ -574,15 +592,40 @@ void DumpNetCDFMPIIO::closefile() if (singlefile_opened) { NCERR( ncmpi_close(ncid) ); singlefile_opened = 0; - // append next time DumpNetCDFMPIIO::openfile is called - append_flag = 1; // write to next frame upon next open - framei++; + if (multifile) + framei = 1; + else { + // append next time DumpNetCDFMPIIO::openfile is called + append_flag = 1; + framei++; + } } } /* ---------------------------------------------------------------------- */ +template +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], + const T* tp) +{ + return ncmpi_put_var1_int(ncid, varid, index, tp); +} + +template <> +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], + const long* tp) +{ + return ncmpi_put_var1_long(ncid, varid, index, tp); +} + +template <> +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], + const long long* tp) +{ + return ncmpi_put_var1_longlong(ncid, varid, index, tp); +} + void DumpNetCDFMPIIO::write() { // open file @@ -616,13 +659,8 @@ void DumpNetCDFMPIIO::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - NCERRX( ncmpi_put_var1_long(ncid, thermovar[i], start, &th->bivalue), + NCERRX( ncmpi_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); -#else - NCERRX( ncmpi_put_var1_int(ncid, thermovar[i], start, &th->bivalue), - th->keyword[i] ); -#endif } } } @@ -882,6 +920,8 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg) return 2; } else if (strcmp(arg[iarg],"at") == 0) { + if (!append_flag) + error->all(FLERR,"expected 'append yes' before 'at' keyword"); iarg++; framei = force->inumeric(FLERR,arg[iarg]); if (framei < 0) framei--; diff --git a/src/USER-OMP/fix_qeq_reax_omp.cpp b/src/USER-OMP/fix_qeq_reax_omp.cpp index 4457ab6592..d89c9627fe 100644 --- a/src/USER-OMP/fix_qeq_reax_omp.cpp +++ b/src/USER-OMP/fix_qeq_reax_omp.cpp @@ -703,7 +703,7 @@ void FixQEqReaxOMP::calculate_Q() q[i] = s[i] - u * t[i]; // backup s & t - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp index 9d165f3fd3..d1c4f90771 100644 --- a/src/USER-REAXC/fix_qeq_reax.cpp +++ b/src/USER-REAXC/fix_qeq_reax.cpp @@ -95,7 +95,7 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) : pack_flag = 0; s = NULL; t = NULL; - nprev = 5; + nprev = 4; Hdia_inv = NULL; b_s = NULL; @@ -817,7 +817,7 @@ void FixQEqReax::calculate_Q() q[i] = s[i] - u * t[i]; /* backup s & t */ - for (k = 4; k > 0; --k) { + for (k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } diff --git a/src/atom.cpp b/src/atom.cpp index 1191f0f2b5..7d343a0807 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -453,12 +453,12 @@ void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix) // if molecular system: // atom IDs must be defined // force atom map to be created - // map style may be reset by map_init() and its call to map_style_set() + // map style will be reset to array vs hash to by map_init() molecular = avec->molecular; if (molecular && tag_enable == 0) error->all(FLERR,"Atom IDs must be used for molecular systems"); - if (molecular) map_style = 1; + if (molecular) map_style = 3; } /* ---------------------------------------------------------------------- @@ -593,6 +593,7 @@ void Atom::modify_params(int narg, char **arg) "Atom_modify map command after simulation box is defined"); if (strcmp(arg[iarg+1],"array") == 0) map_user = 1; else if (strcmp(arg[iarg+1],"hash") == 0) map_user = 2; + else if (strcmp(arg[iarg+1],"yes") == 0) map_user = 3; else error->all(FLERR,"Illegal atom_modify command"); map_style = map_user; iarg += 2; diff --git a/src/atom_map.cpp b/src/atom_map.cpp index bbfe014dec..9d257d99de 100644 --- a/src/atom_map.cpp +++ b/src/atom_map.cpp @@ -298,12 +298,12 @@ int Atom::map_style_set() MPI_Allreduce(&max,&map_tag_max,1,MPI_LMP_TAGINT,MPI_MAX,world); // set map_style for new map - // if user-selected, use that setting + // if user-selected to array/hash, use that setting // else if map_tag_max > 1M, use hash // else use array int map_style_old = map_style; - if (map_user) map_style = map_user; + if (map_user == 1 || map_user == 2) map_style = map_user; else if (map_tag_max > 1000000) map_style = 2; else map_style = 1; diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp index 3c972b8244..06227b7a84 100644 --- a/src/comm_brick.cpp +++ b/src/comm_brick.cpp @@ -476,8 +476,7 @@ void CommBrick::forward_comm(int dummy) if (sendproc[iswap] != me) { if (comm_x_only) { if (size_forward_recv[iswap]) { - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; + buf = x[firstrecv[iswap]]; MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, recvproc[iswap],0,world,&request); } @@ -547,8 +546,7 @@ void CommBrick::reverse_comm() MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) { - if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]]; - else buf = NULL; + buf = f[firstrecv[iswap]]; MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, recvproc[iswap],0,world); } diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp index 04a2df91f8..992049a81f 100644 --- a/src/create_atoms.cpp +++ b/src/create_atoms.cpp @@ -343,6 +343,11 @@ void CreateAtoms::command(int narg, char **arg) } } + // Record wall time for atom creation + + MPI_Barrier(world); + double time1 = MPI_Wtime(); + // clear ghost count and any ghost bonus data internal to AtomVec // same logic as beginning of Comm::exchange() // do it now b/c creating atoms will overwrite ghost atoms @@ -509,6 +514,9 @@ void CreateAtoms::command(int narg, char **arg) if (domain->triclinic) domain->lamda2x(atom->nlocal); } + MPI_Barrier(world); + double time2 = MPI_Wtime(); + // clean up delete ranmol; @@ -521,12 +529,16 @@ void CreateAtoms::command(int narg, char **arg) // print status if (comm->me == 0) { - if (screen) + if (screen) { fprintf(screen,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); - if (logfile) + fprintf(screen," Time spent = %g secs\n",time2-time1); + } + if (logfile) { fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); + fprintf(logfile," Time spent = %g secs\n",time2-time1); + } } // for MOLECULE mode: diff --git a/src/dump.cpp b/src/dump.cpp index 44098298ba..ddd958c25c 100644 --- a/src/dump.cpp +++ b/src/dump.cpp @@ -238,7 +238,7 @@ void Dump::init() int gcmcflag = 0; for (int i = 0; i < modify->nfix; i++) if ((strcmp(modify->fix[i]->style,"gcmc") == 0)) - gcmcflag = 1; + gcmcflag = 1; if (sortcol == 0 && atom->tag_consecutive() && !gcmcflag) { tagint *tag = atom->tag; @@ -898,7 +898,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"fileper") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify fileper " + error->all(FLERR,"Cannot use dump_modify fileper " "without % in dump file name"); int nper = force->inumeric(FLERR,arg[iarg+1]); if (nper <= 0) error->all(FLERR,"Illegal dump_modify command"); @@ -973,7 +973,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"nfile") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify nfile " + error->all(FLERR,"Cannot use dump_modify nfile " "without % in dump file name"); int nfile = force->inumeric(FLERR,arg[iarg+1]); if (nfile <= 0) error->all(FLERR,"Illegal dump_modify command"); diff --git a/src/finish.cpp b/src/finish.cpp index 45e9226388..c22ecaae60 100644 --- a/src/finish.cpp +++ b/src/finish.cpp @@ -130,7 +130,7 @@ void Finish::end(int flag) atom->natoms); if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps, atom->natoms); - + // Gromacs/NAMD-style performance metric for suitable unit settings if ( timeflag && !minflag && !prdflag && !tadflag && @@ -144,7 +144,7 @@ void Finish::end(int flag) double one_fs = force->femtosecond; double t_step = ((double) time_loop) / ((double) update->nsteps); double step_t = 1.0/t_step; - + if (strcmp(update->unit_style,"lj") == 0) { double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs; const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n"; @@ -161,7 +161,7 @@ void Finish::end(int flag) } // CPU use on MPI tasks and OpenMP threads - + if (timeflag) { if (lmp->kokkos) { const char fmt2[] = diff --git a/src/input.cpp b/src/input.cpp index 7d11b8741b..23b89d3040 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -18,7 +18,7 @@ #include #include #include -#include "sys/stat.h" +#include #include "input.h" #include "style_command.h" #include "universe.h" diff --git a/src/main.cpp b/src/main.cpp index 7401183fea..82dac5af6d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,10 @@ #include #endif +#ifdef FFT_FFTW3 +#include +#endif + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- @@ -62,4 +66,10 @@ int main(int argc, char **argv) #endif MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); + +#ifdef FFT_FFTW3 + // tell fftw3 to delete its global memory pool + // and thus avoid bogus valgrind memory leak reports + fftw_cleanup(); +#endif } diff --git a/src/modify.cpp b/src/modify.cpp index 4516788aa9..361079bc16 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -110,7 +110,7 @@ Modify::~Modify() // delete all fixes // do it via delete_fix() so callbacks in Atom are also updated correctly - while (nfix) delete_fix(fix[0]->id); + while (nfix) delete_fix(0); memory->sfree(fix); memory->destroy(fmask); @@ -863,9 +863,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->restart(state_restart_global[i]); used_restart_global[i] = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting global fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting global fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -885,9 +885,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->unpack_restart(j,index_restart_peratom[i]); fix[ifix]->restart_reset = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting peratom fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting peratom fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -944,7 +944,13 @@ void Modify::delete_fix(const char *id) { int ifix = find_fix(id); if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete"); - delete fix[ifix]; + delete_fix(ifix); +} + +void Modify::delete_fix(int ifix) +{ + if(fix[ifix]) + delete fix[ifix]; atom->update_callback(ifix); // move other Fixes and fmask down in list one slot @@ -1409,24 +1415,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_global; i++) - if (used_restart_global[i] == 0) break; + if (used_restart_global[i] == 0) break; if (i == nfix_restart_global) { - if (screen) + if (screen) fprintf(screen,"All restart file global fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file global fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file global fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); - for (i = 0; i < nfix_restart_global; i++) { - if (used_restart_global[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - } + if (screen) fprintf(screen,"Unused restart file global fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); + for (i = 0; i < nfix_restart_global; i++) { + if (used_restart_global[i]) continue; + if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + } } } @@ -1445,24 +1451,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_peratom; i++) - if (used_restart_peratom[i] == 0) break; + if (used_restart_peratom[i] == 0) break; if (i == nfix_restart_peratom) { - if (screen) + if (screen) fprintf(screen,"All restart file peratom fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file peratom fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); - for (i = 0; i < nfix_restart_peratom; i++) { - if (used_restart_peratom[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - } + if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); + for (i = 0; i < nfix_restart_peratom; i++) { + if (used_restart_peratom[i]) continue; + if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + } } } diff --git a/src/modify.h b/src/modify.h index d825d5c4ef..4ec61f6d57 100644 --- a/src/modify.h +++ b/src/modify.h @@ -95,6 +95,7 @@ class Modify : protected Pointers { void add_fix(int, char **, int trysuffix=1); void modify_fix(int, char **); void delete_fix(const char *); + void delete_fix(int); int find_fix(const char *); int find_fix_by_style(const char *); int check_package(const char *); diff --git a/src/output.cpp b/src/output.cpp index ce7fcb7cca..ce593ec6ae 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -827,9 +827,9 @@ void Output::create_restart(int narg, char **arg) sum and print memory usage result is only memory on proc 0, not averaged across procs ------------------------------------------------------------------------- */ + void Output::memory_usage() { - bigint bytes = 0; bytes += atom->memory_usage(); bytes += neighbor->memory_usage(); diff --git a/src/replicate.cpp b/src/replicate.cpp index e2ed718f65..f3d1964169 100644 --- a/src/replicate.cpp +++ b/src/replicate.cpp @@ -74,6 +74,11 @@ void Replicate::command(int narg, char **arg) if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store) error->all(FLERR,"Cannot replicate with fixes that store atom quantities"); + // Record wall time for atom replication + + MPI_Barrier(world); + double time1 = MPI_Wtime(); + // maxtag = largest atom tag across all existing atoms tagint maxtag = 0; @@ -424,4 +429,16 @@ void Replicate::command(int narg, char **arg) Special special(lmp); special.build(); } + + // Wall time + + MPI_Barrier(world); + double time2 = MPI_Wtime(); + + if (me == 0) { + if (screen) + fprintf(screen," Time spent = %g secs\n",time2-time1); + if (logfile) + fprintf(logfile," Time spent = %g secs\n",time2-time1); + } } diff --git a/tools/phonon/Makefile b/tools/phonon/Makefile index 0aacb1e086..67f9b91fdf 100644 --- a/tools/phonon/Makefile +++ b/tools/phonon/Makefile @@ -1,7 +1,7 @@ .SUFFIXES : .o .cpp # compiler and flags -CC = g++ -Wno-unused-result -LINK = $(CC) -static +CC = g++ -Wall +LINK = $(CC) CFLAGS = -O3 $(DEBUG) $(UFLAG) # OFLAGS = -O3 $(DEBUG) @@ -9,18 +9,17 @@ INC = $(LPKINC) $(TCINC) $(SPGINC) LIB = $(LPKLIB) $(TCLIB) $(SPGLIB) # # cLapack library needed -LPKINC = -I/opt/libs/clapack/3.2.1/include -LPKLIB = -L/opt/libs/clapack/3.2.1/lib -lclapack -lblas -lf2c #-lm +LPKINC = +LPKLIB =-llapack # -# Tricubic library needed -TCINC = -I/opt/libs/tricubic/1.0/include -TCLIB = -L/opt/libs/tricubic/1.0/lib -ltricubic # # spglib 1.8.2, used to get the irreducible q-points # if UFLAG is not set, spglib won't be used. -UFLAG = -DUseSPG -SPGINC = -I/opt/libs/spglib/1.8.2/include -SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + +# UFLAG = -DUseSPG +# SPGINC = -I/opt/libs/spglib/1.8.2/include +# SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + # if spglib other than version 1.8.2 is used, please # modify file phonon.cpp, instruction can be found by searching 1.8.2 @@ -36,7 +35,7 @@ SRC = $(wildcard *.cpp) OBJ = $(SRC:.cpp=.o) #==================================================================== -all: ver ${EXE} +all: ${EXE} ${EXE}: $(OBJ) $(LINK) $(OFLAGS) $(OBJ) $(LIB) -o $@ @@ -59,3 +58,16 @@ ver: $(CC) $(CFLAGS) -c $< .cpp.o: $(CC) $(CFLAGS) $(INC) -c $< + +#==================================================================== +# dependencies +disp.o: disp.cpp phonon.h dynmat.h memory.h interpolate.h green.h timer.h \ + global.h +dynmat.o: dynmat.cpp dynmat.h memory.h interpolate.h version.h global.h +green.o: green.cpp green.h memory.h global.h +interpolate.o: interpolate.cpp interpolate.h memory.h global.h +main.o: main.cpp dynmat.h memory.h interpolate.h phonon.h +memory.o: memory.cpp memory.h +phonon.o: phonon.cpp phonon.h dynmat.h memory.h interpolate.h green.h \ + timer.h global.h +timer.o: timer.cpp timer.h diff --git a/tools/phonon/README b/tools/phonon/README index ae6383b6bd..b54d96d8a3 100644 --- a/tools/phonon/README +++ b/tools/phonon/README @@ -5,15 +5,9 @@ analyse the phonon related information. #------------------------------------------------------------------------------- 1. Dependencies - The clapack library is needed to solve the eigen problems, - which could be downloaded from: - http://www.netlib.org/clapack/ - - The tricubic library is also needed to do tricubic interpolations, - which could be obtained from: - http://orca.princeton.edu/francois/software/tricubic/ - or - http://1drv.ms/1J2WFYk + The LAPACK library is needed to solve the eigen problems. + http://www.netlib.org/lapack/ + Intel MKL can be used as well. The spglib is optionally needed, enabling one to evaluate the phonon density of states or vibrational thermal properties diff --git a/tools/phonon/disp.cpp b/tools/phonon/disp.cpp index 2fa603916c..218e01e7fc 100644 --- a/tools/phonon/disp.cpp +++ b/tools/phonon/disp.cpp @@ -18,7 +18,8 @@ void Phonon::pdisp() { // ask the output file name and write the header. char str[MAXLINE]; - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); #ifdef UseSPG // ask method to generate q-lines int method = 2; @@ -53,7 +54,6 @@ void Phonon::pdisp() while (1){ for (int i = 0; i < 3; ++i) qstr[i] = qend[i]; - int quit = 0; printf("\nPlease input the start q-point in unit of B1->B3, q to exit [%g %g %g]: ", qstr[0], qstr[1], qstr[2]); int n = count_words(fgets(str, MAXLINE, stdin)); ptr = strtok(str, " \t\n\r\f"); @@ -2844,7 +2844,8 @@ void Phonon::pdisp() printf("\nPhonon dispersion data are written to: %s, you can visualize the results\n", fname); printf("by invoking: `gnuplot pdisp.gnuplot; gv pdisp.eps`\n"); } - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); delete []fname; nodes.clear(); diff --git a/tools/phonon/dynmat.cpp b/tools/phonon/dynmat.cpp index e82f473130..3b7bfe8268 100644 --- a/tools/phonon/dynmat.cpp +++ b/tools/phonon/dynmat.cpp @@ -3,6 +3,11 @@ #include "version.h" #include "global.h" +extern "C" void zheevd_(char *, char *, long int *, doublecomplex *, + long int *, double *, doublecomplex *, + long int *, double *, long int *, long int *, + long int *, long int *); + // to initialize the class DynMat::DynMat(int narg, char **arg) { @@ -81,7 +86,8 @@ DynMat::DynMat(int narg, char **arg) printf("Number of atoms per unit cell : %d\n", nucell); printf("System dimension : %d\n", sysdim); printf("Boltzmann constant in used units : %g\n", boltz); - for (int i = 0; i < 80; ++i) printf("="); printf("\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n"); if (sysdim < 1||sysdim > 3||nx < 1||ny < 1||nz < 1||nucell < 1){ printf("Wrong values read from header of file: %s, please check the binary file!\n", binfile); fclose(fp); exit(3); @@ -117,11 +123,11 @@ DynMat::DynMat(int narg, char **arg) memory->create(attyp, nucell, "DynMat:attyp"); memory->create(M_inv_sqrt, nucell, "DynMat:M_inv_sqrt"); - if ( fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} fclose(fp); car2dir(); @@ -229,9 +235,9 @@ return; int DynMat::geteigen(double *egv, int flag) { char jobz, uplo; - integer n, lda, lwork, lrwork, *iwork, liwork, info; + long int n, lda, lwork, lrwork, *iwork, liwork, info; doublecomplex *work; - doublereal *w = &egv[0], *rwork; + double *w = &egv[0], *rwork; n = fftdim; if (flag) jobz = 'V'; @@ -338,7 +344,8 @@ void DynMat::EnforceASR() char *ptr = strtok(str," \t\n\r\f"); if (ptr) nasr = atoi(ptr); if (nasr < 1){ - for (int i=0; i<80; i++) printf("="); printf("\n"); + for (int i=0; i<80; i++) printf("="); + printf("\n"); return; } @@ -404,7 +411,8 @@ void DynMat::EnforceASR() if (i == 99){ printf("...... (%d more skiped)", fftdim-100); break;} } printf("\n"); - for (int i = 0; i < 80; ++i) printf("="); printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); return; } @@ -456,7 +464,7 @@ return; * --------------------------------------------------------------------*/ void DynMat::GaussJordan(int n, double *Mat) { - int i,icol,irow,j,k,l,ll,idr,idc; + int i,icol=0,irow=0,j,k,l,ll,idr,idc; int *indxc,*indxr,*ipiv; double big, nmjk; double dum, pivinv; diff --git a/tools/phonon/dynmat.h b/tools/phonon/dynmat.h index 1d6e716584..f5bd4010b8 100644 --- a/tools/phonon/dynmat.h +++ b/tools/phonon/dynmat.h @@ -7,11 +7,6 @@ #include "memory.h" #include "interpolate.h" -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} - using namespace std; class DynMat { diff --git a/tools/phonon/green.cpp b/tools/phonon/green.cpp index 8f8946dc4f..35514c03fb 100644 --- a/tools/phonon/green.cpp +++ b/tools/phonon/green.cpp @@ -224,7 +224,6 @@ void Green::recursion() { // local variables std::complex Z, rec_x, rec_x_inv; - std::complex cunit = std::complex(0.,1.); double w = wmin; diff --git a/tools/phonon/interpolate.cpp b/tools/phonon/interpolate.cpp index 8c0cbde1ce..954062d415 100644 --- a/tools/phonon/interpolate.cpp +++ b/tools/phonon/interpolate.cpp @@ -1,7 +1,125 @@ #include "interpolate.h" -#include "math.h" +#include #include "global.h" +/////////////////////// +// tricubic library code +static int A[64][64] = { +{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9,-9, 9, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4,-4, 4, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, +{-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9, 0, 0,-9, 9, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9, 0, 0,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0}, +{ 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, +{-27,27,27,-27,27,-27,-27,27,-18,-9,18, 9,18, 9,-18,-9,-18,18,-9, 9,18,-18, 9,-9,-18,18,18,-18,-9, 9, 9,-9,-12,-6,-6,-3,12, 6, 6, 3,-12,-6,12, 6,-6,-3, 6, 3,-12,12,-6, 6,-6, 6,-3, 3,-8,-4,-4,-2,-4,-2,-2,-1}, +{18,-18,-18,18,-18,18,18,-18, 9, 9,-9,-9,-9,-9, 9, 9,12,-12, 6,-6,-12,12,-6, 6,12,-12,-12,12, 6,-6,-6, 6, 6, 6, 3, 3,-6,-6,-3,-3, 6, 6,-6,-6, 3, 3,-3,-3, 8,-8, 4,-4, 4,-4, 2,-2, 4, 4, 2, 2, 2, 2, 1, 1}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6, 9,-9, 9,-9,-9, 9,-9, 9,12,-12,-12,12, 6,-6,-6, 6, 6, 3, 6, 3,-6,-3,-6,-3, 8, 4,-8,-4, 4, 2,-4,-2, 6,-6, 6,-6, 3,-3, 3,-3, 4, 2, 4, 2, 2, 1, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-6, 6,-6, 6, 6,-6, 6,-6,-8, 8, 8,-8,-4, 4, 4,-4,-3,-3,-3,-3, 3, 3, 3, 3,-4,-4, 4, 4,-2,-2, 2, 2,-4, 4,-4, 4,-2, 2,-2, 2,-2,-2,-2,-2,-1,-1,-1,-1}, +{ 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4, 0, 0,-4, 4, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4, 0, 0,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6,12,-12, 6,-6,-12,12,-6, 6, 9,-9,-9, 9, 9,-9,-9, 9, 8, 4, 4, 2,-8,-4,-4,-2, 6, 3,-6,-3, 6, 3,-6,-3, 6,-6, 3,-3, 6,-6, 3,-3, 4, 2, 2, 1, 4, 2, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-8, 8,-4, 4, 8,-8, 4,-4,-6, 6, 6,-6,-6, 6, 6,-6,-4,-4,-2,-2, 4, 4, 2, 2,-3,-3, 3, 3,-3,-3, 3, 3,-4, 4,-2, 2,-4, 4,-2, 2,-2,-2,-1,-1,-2,-2,-1,-1}, +{ 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, +{-12,12,12,-12,12,-12,-12,12,-8,-4, 8, 4, 8, 4,-8,-4,-6, 6,-6, 6, 6,-6, 6,-6,-6, 6, 6,-6,-6, 6, 6,-6,-4,-2,-4,-2, 4, 2, 4, 2,-4,-2, 4, 2,-4,-2, 4, 2,-3, 3,-3, 3,-3, 3,-3, 3,-2,-1,-2,-1,-2,-1,-2,-1}, +{ 8,-8,-8, 8,-8, 8, 8,-8, 4, 4,-4,-4,-4,-4, 4, 4, 4,-4, 4,-4,-4, 4,-4, 4, 4,-4,-4, 4, 4,-4,-4, 4, 2, 2, 2, 2,-2,-2,-2,-2, 2, 2,-2,-2, 2, 2,-2,-2, 2,-2, 2,-2, 2,-2, 2,-2, 1, 1, 1, 1, 1, 1, 1, 1}}; + +static int ijk2n(int i, int j, int k) { + return(i+4*j+16*k); +} + +/* ---------------------------------------------------------------------------- */ + +static void tricubic_get_coeff_stacked(double a[64], double x[64]) { + int i,j; + for (i=0;i<64;i++) { + a[i]=(double)(0.0); + for (j=0;j<64;j++) { + a[i]+=A[i][j]*x[j]; + } + } +} + +static void tricubic_get_coeff(double a[64], double f[8], double dfdx[8], double dfdy[8], double dfdz[8], double d2fdxdy[8], double d2fdxdz[8], double d2fdydz[8], double d3fdxdydz[8]) { + int i; + double x[64]; + for (i=0;i<8;i++) { + x[0+i]=f[i]; + x[8+i]=dfdx[i]; + x[16+i]=dfdy[i]; + x[24+i]=dfdz[i]; + x[32+i]=d2fdxdy[i]; + x[40+i]=d2fdxdz[i]; + x[48+i]=d2fdydz[i]; + x[56+i]=d3fdxdydz[i]; + } + tricubic_get_coeff_stacked(a,x); +} + +static double tricubic_eval(double a[64], double x, double y, double z) { + int i,j,k; + double ret=(double)(0.0); + /* TRICUBIC EVAL + This is the short version of tricubic_eval. It is used to compute + the value of the function at a given point (x,y,z). To compute + partial derivatives of f, use the full version with the extra args. + */ + for (i=0;i<4;i++) { + for (j=0;j<4;j++) { + for (k=0;k<4;k++) { + ret+=a[ijk2n(i,j,k)]*pow(x,i)*pow(y,j)*pow(z,k); + } + } + } + return(ret); +} + /* ---------------------------------------------------------------------------- * Constructor used to get info from caller, and prepare other necessary data * ---------------------------------------------------------------------------- */ @@ -274,7 +392,8 @@ void Interpolate::set_method() which =2-im%2; printf("Your selection: %d\n", which); - for(int i=0; i<80; i++) printf("="); printf("\n\n"); + for(int i=0; i<80; i++) printf("="); + printf("\n\n"); if (which == 1) tricubic_init(); @@ -306,4 +425,3 @@ void Interpolate::reset_gamma() return; } -/* ---------------------------------------------------------------------------- */ diff --git a/tools/phonon/interpolate.h b/tools/phonon/interpolate.h index e192fcac87..04a358ae71 100644 --- a/tools/phonon/interpolate.h +++ b/tools/phonon/interpolate.h @@ -5,11 +5,8 @@ #include "stdlib.h" #include "string.h" #include "memory.h" -#include -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} + +extern "C" typedef struct { double r, i; } doublecomplex; using namespace std; diff --git a/tools/phonon/phonon.cpp b/tools/phonon/phonon.cpp index 43bea111b4..065885cf3f 100644 --- a/tools/phonon/phonon.cpp +++ b/tools/phonon/phonon.cpp @@ -42,7 +42,8 @@ Phonon::Phonon(DynMat *dm) printf("\n"); for (int i = 0; i < 37; ++i) printf("="); printf(" Menu "); - for (int i = 0; i < 37; ++i) printf("="); printf("\n"); + for (int i = 0; i < 37; ++i) printf("="); + printf("\n"); printf(" 1. Phonon DOS evaluation;\n"); printf(" 2. Phonon dispersion curves;\n"); printf(" 3. Dynamical matrix at arbitrary q;\n"); @@ -60,7 +61,8 @@ Phonon::Phonon(DynMat *dm) printf("Your choice [0]: "); if (count_words(fgets(str,MAXLINE,stdin)) > 0) job = atoi(strtok(str," \t\n\r\f")); printf("\nYour selection: %d\n", job); - for (int i = 0; i < 80; ++i) printf("=");printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); // now to do the job according to user's choice if (job == 1) pdos(); @@ -414,7 +416,8 @@ void Phonon::vfanyq() dynmat->geteigen(egvs, 0); printf("q-point: [%lg %lg %lg], ", q[0], q[1], q[2]); printf("vibrational frequencies at this q-point:\n"); - for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); printf("\n\n"); + for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); + printf("\n\n"); } return; @@ -1001,7 +1004,8 @@ void Phonon::ShowCell() printf("\n"); for (int i = 0; i < 30; ++i) printf("="); printf(" Unit Cell Info "); - for (int i = 0; i < 30; ++i) printf("="); printf("\n"); + for (int i = 0; i < 30; ++i) printf("="); + printf("\n"); printf("Number of atoms in the unit cell: %d\n", dynmat->nucell); printf("Basis vectors of the unit cell:\n"); printf(" %15.8f %15.8f %15.8f\n", dynmat->basevec[0], dynmat->basevec[1], dynmat->basevec[2]); @@ -1091,7 +1095,7 @@ int Phonon::count_words(const char *line) strcpy(copy,line); char *ptr; - if (ptr = strchr(copy,'#')) *ptr = '\0'; + if ((ptr = strchr(copy,'#'))) *ptr = '\0'; if (strtok(copy," \t\n\r\f") == NULL) { memory->destroy(copy); diff --git a/tools/phonon/version.h b/tools/phonon/version.h index 8ed0e80aa7..decab631b0 100644 --- a/tools/phonon/version.h +++ b/tools/phonon/version.h @@ -1 +1 @@ -#define VERSION 7 +#define VERSION 8