Merge branch 'master' into HEAD

Christoph Junghans
2017-10-07 12:35:51 -06:00
125 changed files with 3966 additions and 2965 deletions

View File

@ -37,6 +37,10 @@ enable_language(CXX)
#####################################################################
include(CheckCCompilerFlag)
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
endif()
########################################################################
# User input options #
########################################################################
@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
if(LAMMPS_EXCEPTIONS)
add_definitions(-DLAMMPS_EXCEPTIONS)
set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS")
set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
endif()
set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically")
@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
############################################
add_library(lammps ${LIB_SOURCES})
target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
if(LAMMPS_DEPS)
add_dependencies(lammps ${LAMMPS_DEPS})
endif()
set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE})
if(BUILD_SHARED_LIBS)
set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
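A hedged sketch (paths and values illustrative) of driving these options from the CMake command line:

cmake -D LAMMPS_EXCEPTIONS=on -D LAMMPS_MACHINE=mpi -D BUILD_SHARED_LIBS=on ../cmake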

Binary file not shown. (Image diff: 20 KiB before, 19 KiB after.)

View File

@ -706,7 +706,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding
quantum forces calculated by LATTE.
More information on LATTE can be found at this web site:
"https://github.com/lanl/LATTE"_#latte_home. A brief technical
"https://github.com/lanl/LATTE"_latte_home. A brief technical
description is given with the "fix latte"_fix_latte.html command.
:link(latte_home,https://github.com/lanl/LATTE)
@ -729,6 +729,7 @@ make lib-latte args="-b" # download and build in lib/latte/LATTE-
make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte
make lib-latte args="-b -m gfortran" # download and build in lib/latte and
# copy Makefile.lammps.gfortran to Makefile.lammps
:pre
Note that 3 symbolic (soft) links, "includelink" and "liblink" and
"filelink", are created in lib/latte to point into the LATTE home dir.

View File

@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
[Currently Available USER-INTEL Styles:]
Angle Styles: charmm, harmonic :ulb,l
Bond Styles: fene, harmonic :l
Bond Styles: fene, fourier, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
Improper Styles: cvff, harmonic :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long,
rebo, sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
:ule
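As a purely illustrative invocation (binary and input file names are hypothetical), these styles are typically enabled with the intel suffix from the command line:

lmp_intel_cpu -sf intel -pk intel 0 -in in.script :pre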
@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
:c,image(JPG/user_intel.png)
Results are speedups obtained on Intel Xeon E5-2697v4 processors
(code-named Broadwell) and Intel Xeon Phi 7250 processors
(code-named Knights Landing) with "June 2017" LAMMPS built with
Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
per physical core. See {src/USER-INTEL/TEST/README} for the raw
simulation rates and instructions to reproduce.
(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
Knights Landing), and Intel Xeon Gold 6148 processors (code-named
Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
2017 update 2. Results are with 1 MPI task per physical core. See
{src/USER-INTEL/TEST/README} for the raw simulation rates and
instructions to reproduce.
:line
@ -82,6 +83,11 @@ this order :l
The {newton} setting applies to all atoms, not just atoms shared
between MPI tasks :l
Vectorization can change the order for adding pairwise forces :l
When using the -DLMP_USE_MKL_RNG define (as all included Intel-optimized
makefiles do) at build time, the random number generator for
dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
Twister generator included in the Intel MKL library (which should be
more robust than the default Marsaglia random number generator) :l
:ule
The precision mode (described below) used with the USER-INTEL
@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs:
Runs should be performed using MCDRAM. :ulb,l
:ule
For simulations using {kspace_style pppm} on Intel CPUs
supporting AVX-512:
For simulations using {kspace_style pppm} on Intel CPUs supporting
AVX-512:
Add "kspace_modify diff ad" to the input script :ulb,l
The command-line option should be changed to
@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building
without offload support will produce a smaller binary.
The general requirements for Makefiles with the USER-INTEL package
are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
using Intel compilers, "-restrict" is required and "-qopenmp" is
highly recommended for CCFLAGS and LINKFLAGS. LIB should include
"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
Other recommended CCFLAG options for best performance are
"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
-no-prec-div".
are as follows. When using Intel compilers, "-restrict" is required
and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS.
CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
(unless Intel Math Kernel Library (MKL) is not available in the build
environment). For Intel compilers, LIB should include "-ltbbmalloc"
or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
recommended CCFLAG options for best performance are "-O2 -fno-alias
-ansi-alias -qoverride-limits -fp-model fast=2 -no-prec-div".
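Putting these recommendations together, a minimal sketch of the
relevant Makefile variables (a hedged example, not one of the shipped
makefiles; -DLAMMPS_MEMALIGN=64 is the CCFLAGS setting noted elsewhere
in this manual):

CCFLAGS = -O2 -fno-alias -ansi-alias -qoverride-limits -fp-model fast=2 \
 -no-prec-div -qopenmp -restrict -DLAMMPS_MEMALIGN=64 \
 -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG
LINKFLAGS = -qopenmp
LIB = -ltbbmalloc :pre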
NOTE: The vectorization and math capabilities can differ depending on
the CPU. For Intel compilers, the "-x" flag specifies the type of

View File

@ -16,7 +16,7 @@ atom_modify keyword values ... :pre
one or more keyword/value pairs may be appended :ulb,l
keyword = {id} or {map} or {first} or {sort} :l
{id} value = {yes} or {no}
{map} value = {array} or {hash}
{map} value = {yes} or {array} or {hash}
{first} value = group-ID = group whose atoms will appear first in internal atom lists
{sort} values = Nfreq binsize
Nfreq = sort atoms spatially every this many time steps
@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l
[Examples:]
atom_modify map hash
atom_modify map array sort 10000 2.0
atom_modify map yes
atom_modify map hash sort 10000 2.0
atom_modify first colloid :pre
[Description:]
@ -62,29 +62,33 @@ switch. This is described in "Section 2.2"_Section_start.html#start_2
of the manual. If atom IDs are not used, they must be specified as 0
for all atoms, e.g. in a data or restart file.
The {map} keyword determines how atom ID lookup is done for molecular
atom styles. Lookups are performed by bond (angle, etc) routines in
LAMMPS to find the local atom index associated with a global atom ID.
The {map} keyword determines how atoms with specific IDs are found
when required. Examples are the bond (angle, etc) methods, which
need to find the local index of an atom with a specific global ID
that is a bond (angle, etc) partner. LAMMPS performs this operation
efficiently by creating a "map", which is either an {array} or {hash}
table, as described below.
When the {array} value is used, each processor stores a lookup table
of length N, where N is the largest atom ID in the system. This is a
When the {map} keyword is not specified in your input script, LAMMPS
only creates a map for "atom_styles"_atom_style.html for molecular
systems which have permanent bonds (angles, etc). No map is created
for atomic systems, since it is normally not needed. However some
LAMMPS commands require a map, even for atomic systems, and will
generate an error if one does not exist. The {map} keyword thus
allows you to force the creation of a map. The {yes} value will
create either an {array} or {hash} style map, as explained in the next
paragraph. The {array} and {hash} values create an array-style or
hash-style map, respectively.
For an {array}-style map, each processor stores a lookup table of
length N, where N is the largest atom ID in the system. This is a
fast, simple method for many simulations, but requires too much memory
for large simulations. The {hash} value uses a hash table to perform
the lookups. This can be slightly slower than the {array} method, but
its memory cost is proportional to the number of atoms owned by a
processor, i.e. N/P when N is the total number of atoms in the system
and P is the number of processors.
When this setting is not specified in your input script, LAMMPS
creates a map, if one is needed, as an array or hash. See the
discussion of default values below for how LAMMPS chooses which kind
of map to build. Note that atomic systems do not normally need to
create a map. However, even in this case some LAMMPS commands will
create a map to find atoms (and then destroy it), or require a
permanent map. An example of the former is the "velocity loop
all"_velocity.html command, which uses a map when looping over all
atoms and insuring the same velocity values are assigned to an atom
ID, no matter which processor owns it.
for large simulations. For a {hash}-style map, a hash table is
created on each processor, which finds an atom ID in constant time
(independent of the global number of atom IDs). It can be slightly
slower than the {array} map, but its memory cost is proportional to
the number of atoms owned by a processor, i.e. N/P when N is the total
number of atoms in the system and P is the number of processors.
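For example (a minimal hypothetical fragment; the data file name is
illustrative), a large atomic system can be given a memory-lean map
explicitly:

atom_style atomic
atom_modify map hash
read_data data.atomic :pre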
The {first} keyword allows a "group"_group.html to be specified whose
atoms will be maintained as the first atoms in each processor's list

View File

@ -7,6 +7,7 @@
:line
dihedral_style fourier command :h3
dihedral_style fourier/intel command :h3
dihedral_style fourier/omp command :h3
[Syntax:]

View File

@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre
dump-ID = ID of dump to modify :ulb,l
one or more keyword/value pairs may be appended :l
these keywords apply to various dump styles :l
keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
{append} arg = {yes} or {no} or {at} N
keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
{append} arg = {yes} or {no}
{at} arg = N
N = index of frame written upon first dump
only available after "append yes"
{buffer} arg = {yes} or {no}
{element} args = E1 E2 ... EN, where N = # of atom types
E1,...,EN = element name, e.g. C or Fe or Ga

View File

@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule
dump 1 all netcdf 100 traj.nc type x y z vx vy vz
dump_modify 1 append yes at -1 thermo yes
dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
dump 1 all netcdf/mpiio 1000 traj.nc id type x y z
dump 1 all netcdf 1000 traj.*.nc id type x y z :pre
[Description:]
@ -73,4 +74,3 @@ section for more info.
[Related commands:]
"dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html

View File

@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond
integrals are parameterized using a Slater-Koster tight-binding
approach. This procedure, which usually is referred to as the DFTB
method has been described in detail by ("Elstner"_#Elstner) and
("Finnis"_#Finnis) and coworkers.
("Finnis"_#Finnis2) and coworkers.
The work of the LATTE developers follows that of Elstner closely with
respect to the physical model. However, the development of LATTE is
@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
7260 (1998).
:link(Finnis)
:link(Finnis2)
[(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van
Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998).
@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
[(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys.,
141, 164123, (2014).
:link(Niklasson2014)
:link(Niklasson2017)
[(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).
:link(Niklasson2012)
[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
:link(Cawkwell2012)
[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
(17), 174308 (2012).
:link(Negre2016)

View File

@ -93,7 +93,7 @@ intermediate replica with the previous and the next image:
Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
Note that in this case the specified {Kspring) is in force/distance
Note that in this case the specified {Kspring} is in force/distance
units.
With a value of {ideal}, the spring force is computed as suggested in
@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and
RDideal is the ideal RD for which all the images are equally spaced.
I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
I is the replica number). The meanDist is the average distance
between replicas. Note that in this case the specified {Kspring) is
between replicas. Note that in this case the specified {Kspring} is
in force units.
Note that the {ideal} form of nudging can often be more effective at

View File

@ -393,32 +393,36 @@ thermostatting and barostatting.
:line
These fixes compute a temperature and pressure each timestep. To do
this, the fix creates its own computes of style "temp" and "pressure",
as if one of these two sets of commands had been issued:
this, the thermostat and barostat fixes create their own computes of
style "temp" and "pressure", as if one of these sets of commands had
been issued:
For fix nvt:
compute fix-ID_temp group-ID temp
compute fix-ID_press group-ID pressure fix-ID_temp :pre
For fix npt and fix nph:
compute fix-ID_temp all temp
compute fix-ID_press all pressure fix-ID_temp :pre
See the "compute temp"_compute_temp.html and "compute
pressure"_compute_pressure.html commands for details. Note that the
IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID
+ underscore + "press". For fix nvt, the group for the new computes
is the same as the fix group. For fix nph and fix npt, the group for
the new computes is "all" since pressure is computed for the entire
system.
For fix nvt, the group for the new temperature compute is the same as
the fix group. For fix npt and fix nph, the group for both the new
temperature and pressure compute is "all" since pressure is computed
for the entire system. In the case of fix nph, the temperature
compute is not used for thermostatting, but just for a kinetic-energy
contribution to the pressure. See the "compute
temp"_compute_temp.html and "compute pressure"_compute_pressure.html
commands for details. Note that the IDs of the new computes are the
fix-ID + underscore + "temp" or fix_ID + underscore + "press".
Note that these are NOT the computes used by thermodynamic output (see
the "thermo_style"_thermo_style.html command) with ID = {thermo_temp}
and {thermo_press}. This means you can change the attributes of this
and {thermo_press}. This means you can change the attributes of these
fix's temperature or pressure via the
"compute_modify"_compute_modify.html command or print this temperature
or pressure during thermodynamic output via the "thermo_style
custom"_thermo_style.html command using the appropriate compute-ID.
It also means that changing attributes of {thermo_temp} or
{thermo_press} will have no effect on this fix.
"compute_modify"_compute_modify.html command. Or you can print this
temperature or pressure during thermodynamic output via the
"thermo_style custom"_thermo_style.html command using the appropriate
compute-ID. It also means that changing attributes of {thermo_temp}
or {thermo_press} will have no effect on this fix.
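As an illustrative sketch (the fix ID {myfix} is hypothetical), the
auto-created computes can be referenced in thermodynamic output:

fix myfix all nvt temp 300.0 300.0 100.0
thermo_style custom step temp c_myfix_temp c_myfix_press :pre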
Like other fixes that perform thermostatting, fix nvt and fix npt can
be used with "compute commands"_compute.html that calculate a

View File

@ -59,6 +59,7 @@ Fixes :h1
fix_langevin
fix_langevin_drude
fix_langevin_eff
fix_latte
fix_lb_fluid
fix_lb_momentum
fix_lb_pc

View File

@ -188,6 +188,7 @@ fix_ipi.html
fix_langevin.html
fix_langevin_drude.html
fix_langevin_eff.html
fix_latte.html
fix_lb_fluid.html
fix_lb_momentum.html
fix_lb_pc.html

View File

@ -62,7 +62,7 @@ args = arguments specific to the style :l
{no_affinity} values = none
{kokkos} args = keyword value ...
zero or more keyword/value pairs may be appended
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward}
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
{neigh} value = {full} or {half}
full = full neighbor list
half = half neighbor list built in thread-safe manner
@ -75,9 +75,10 @@ args = arguments specific to the style :l
{binsize} value = size
size = bin size for neighbor list construction (distance units)
{comm} value = {no} or {host} or {device}
use value for both comm/exchange and comm/forward
use value for comm/exchange and comm/forward and comm/reverse
{comm/exchange} value = {no} or {host} or {device}
{comm/forward} value = {no} or {host} or {device}
{comm/reverse} value = {no} or {host} or {device}
no = perform communication pack/unpack in non-KOKKOS mode
host = perform pack/unpack on host (e.g. with OpenMP threading)
device = perform pack/unpack on device (e.g. on GPU)
@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at
performing pairwise interactions, then this rule of thumb may give too
large a binsize.
The {comm} and {comm/exchange} and {comm/forward} keywords determine
The {comm}, {comm/exchange}, {comm/forward}, and {comm/reverse} keywords determine
whether the host or device performs the packing and unpacking of data
when communicating per-atom data between processors. "Exchange"
communication happens only on timesteps that neighbor lists are
rebuilt. The data is only for atoms that migrate to new processors.
"Forward" communication happens every timestep. The data is for atom
"Forward" communication happens every timestep. "Reverse" communication
happens every timestep if the {newton} option is on. The data is for atom
coordinates and any other atom properties that need to be updated for
ghost atoms owned by each processor.
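For instance (an illustrative command using the syntax above), all
three communication phases can be packed and unpacked on the device
with:

package kokkos comm device :pre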
The {comm} keyword is simply a short-cut to set the same value
for both the {comm/exchange} and {comm/forward} keywords.
for the {comm/exchange}, {comm/forward}, and {comm/reverse} keywords.
The value options for all 3 keywords are {no} or {host} or {device}.
A value of {no} means to use the standard non-KOKKOS method of

View File

@ -8,6 +8,7 @@
pair_style dpd command :h3
pair_style dpd/gpu command :h3
pair_style dpd/intel command :h3
pair_style dpd/omp command :h3
pair_style dpd/tstat command :h3
pair_style dpd/tstat/gpu command :h3

View File

@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix.
Style {eam/fs} computes pairwise interactions for metals and metal
alloys using a generalized form of EAM potentials due to Finnis and
Sinclair "(Finnis)"_#Finnis. The total energy Ei of an atom I is
Sinclair "(Finnis)"_#Finnis1. The total energy Ei of an atom I is
given by
:c,image(Eqs/pair_eam_fs.jpg)
@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004).
[(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
Daw, Baskes, Phys Rev B, 29, 6443 (1984).
:link(Finnis)
:link(Finnis1)
[(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
:link(Stukowski)

View File

@ -1,5 +1,24 @@
# Change Log
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
**Implemented enhancements:**
- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
**Fixed bugs:**
- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)

View File

@ -443,7 +443,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_LIBS += -lmemkind
KOKKOS_LIBS += -lmemkind -lnuma
tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif
@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is a really a GNU compiler or it could be XL on P8.
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
else
# Assume that this is really a GNU compiler on P8.
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
endif
endif
endif
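# Illustrative usage (an assumption, not part of this Makefile): this
# branch is taken when targeting POWER8 with a GNU or XL toolchain, e.g.
#   make KOKKOS_ARCH=Power8 CXX=g++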
@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is a really a GNU compiler or it could be XL on P9.
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
else
# Assume that this is really a GNU compiler on P9.
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
endif
endif
endif
endif

View File

@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}
#endif
#if defined(KOKKOS_ENABLE_ROCM)
template<>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:
typedef Kokkos::Experimental::ROCm device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND/range)*range;
uint32_t tmp = urand();
while(tmp>=max_val)
tmp = urand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end ) {
return urand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64/range)*range;
uint64_t tmp = urand64();
while(tmp>=max_val)
tmp = urand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
return urand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int rand() {
return static_cast<int>(urand()/2);
}
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND/range)*range;
int tmp = rand();
while(tmp>=max_val)
tmp = rand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end ) {
return rand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() {
return static_cast<int64_t>(urand64()/2);
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64/range)*range;
int64_t tmp = rand64();
while(tmp>=max_val)
tmp = rand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end ) {
return rand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
float frand() {
return 1.0f * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end ) {
return frand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
double drand() {
return 1.0 * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end ) {
return drand(end-start)+start;
}
//Marsaglia polar method for drawing a standard normal distributed random number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while(S>=1.0) {
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*std::sqrt(-2.0*log(S)/S);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0) {
return mean + normal()*std_dev;
}
};
template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
return;
#endif
}
template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
for(int i=0; i<16; i++)
state_(state.state_idx_,i) = state.state_[i];
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif
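A hedged usage sketch of the pool/generator API specialized above
(execution space, seed, and sizes are illustrative assumptions):

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // One pool of XorShift1024 states; each thread leases and returns a state.
    Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool(12345);
    const int n = 1 << 20;
    Kokkos::View<double*> x("x", n);
    Kokkos::parallel_for("fill", n, KOKKOS_LAMBDA(const int i) {
      auto gen = pool.get_state();  // lease a generator state
      x(i) = gen.drand(0.0, 1.0);   // uniform double in [0,1)
      pool.free_state(gen);         // return the state to the pool
    });
  }
  Kokkos::finalize();
  return 0;
}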

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_Threads
@ -51,6 +57,9 @@ endif
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosAlgorithms_UnitTest_Cuda
./KokkosAlgorithms_UnitTest_Cuda
test-rocm: KokkosAlgorithms_UnitTest_ROCm
./KokkosAlgorithms_UnitTest_ROCm
test-threads: KokkosAlgorithms_UnitTest_Threads
./KokkosAlgorithms_UnitTest_Threads

View File

@ -0,0 +1,112 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <TestRandom.hpp>
#include <TestSort.hpp>
namespace Test {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
void rocm_test_random_xorshift64( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
void rocm_test_random_xorshift1024( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
#define ROCM_RANDOM_XORSHIFT64( num_draws ) \
TEST_F( rocm, Random_XorShift64 ) { \
rocm_test_random_xorshift64(num_draws); \
}
#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \
TEST_F( rocm, Random_XorShift1024 ) { \
rocm_test_random_xorshift1024(num_draws); \
}
#define ROCM_SORT_UNSIGNED( size ) \
TEST_F( rocm, SortUnsigned ) { \
Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \
}
ROCM_RANDOM_XORSHIFT64( 132141141 )
ROCM_RANDOM_XORSHIFT1024( 52428813 )
ROCM_SORT_UNSIGNED(171)
#undef ROCM_RANDOM_XORSHIFT64
#undef ROCM_RANDOM_XORSHIFT1024
#undef ROCM_SORT_UNSIGNED
}
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */

View File

@ -27,7 +27,7 @@ fi
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
MY_PID="$BASHPID"
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
fi
################################################################################
@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
declare -i HPCBIND_QUEUE_MAPPING=0
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="sbatch"
if [[ ! -z "${PMI_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="mpich"
HPCBIND_QUEUE_INDEX=${PMI_RANK}
elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="openmpi"
HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="mvapich2"
HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="slurm"
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="bsub"
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="aprun"
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
################################################################################
# Show help
################################################################################
@ -91,13 +102,14 @@ function show_help {
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --whole-system ${cmd} will ignore the its parent process binding"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
@ -110,22 +122,30 @@ function show_help {
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " --output-prefix=<P> Save the output to files of the form"
echo " P-N.log, P-N.out and P-N.err where P is the prefix"
echo " and N is the queue index or mpi rank (no spaces)"
echo " --output-mode=<Op> How console output should be handled."
echo " Options are all, rank0, and none. Default: rank0"
echo " --lstopo Show bindings in lstopo"
echo " -v|--verbose Print bindings and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bing the process to all even cores"
echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
echo " and save the output to rank specific files"
echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
echo " --distribute=4 -v --output-prefix=output -- command ..."
echo " Bind the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " skip GPU 0 when mapping visible devices"
echo " Bind the the even cores of socket 0 and the odd cores of socket 1"
echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
echo " Skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " ${cmd} --proc-bind=numa:0 -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
@ -144,7 +164,7 @@ fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
declare -i HPCBIND_PARTITION=-1
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0
for i in $@; do
case $i in
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"
declare -i HPCBIND_HAS_COMMAND=0
for i in "$@"; do
case "$i" in
# number of partitions to create
--no-hwloc-bind)
HPCBIND_ENABLE_HWLOC_BIND=0
@ -169,6 +193,10 @@ for i in $@; do
HPCBIND_PROC_BIND="${i#*=}"
shift
;;
--whole-system)
HPCBIND_HWLOC_PARENT_CPUSET=""
shift
;;
--distribute=*)
HPCBIND_DISTRIBUTE="${i#*=}"
shift
@ -182,8 +210,8 @@ for i in $@; do
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
shift
;;
--gpu-ignore-queue)
HPCBIND_QUEUE_GPU_MAPPING=0
--ignore-queue)
HPCBIND_QUEUE_MAPPING=0
shift
;;
--no-gpu-mapping)
@ -218,14 +246,18 @@ for i in $@; do
HPCBIND_OPENMP_NESTED="false"
shift
;;
--show-bindings)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=1
--output-prefix=*)
HPCBIND_OUTPUT_PREFIX="${i#*=}"
shift
;;
--output-mode=*)
HPCBIND_OUTPUT_MODE="${i#*=}"
#convert to lower case
HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
shift
;;
--lstopo)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=0
HPCBIND_LSTOPO=1
shift
;;
@ -239,6 +271,7 @@ for i in $@; do
;;
# ignore remaining arguments
--)
HPCBIND_HAS_COMMAND=1
shift
break
;;
@ -250,16 +283,41 @@ for i in $@; do
esac
done
################################################################################
# Check output mode
################################################################################
declare -i HPCBIND_TEE=0
if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
HPCBIND_TEE=0
elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
HPCBIND_TEE=1
elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
#default to rank0 printing to screen
HPCBIND_TEE=1
fi
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_LOG=/dev/null
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
> ${HPCBIND_LOG}
fi
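# Illustrative (hypothetical launch, not part of this script): with
#   mpiexec -np 4 hpcbind --output-prefix=run -- ./app
# each rank writes run-<index>.hpc.log, run-<index>.out, and run-<index>.err,
# while the default --output-mode=rank0 echoes only queue index 0 to the console.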
################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
exit 1
fi
################################################################################
# Check that visible gpus are valid
################################################################################
@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
HPCBIND_VISIBLE_GPUS[$i]=0;
fi
done
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
echo "OpenMP percent < 1, setting to 1"
HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
echo "OpenMP percent > 100, setting to 100"
HPCBIND_OPENMP_PERCENT=100
fi
@ -291,15 +346,21 @@ fi
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
HPCBIND_DISTRIBUTE=1
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
echo "Invalid input for distribute-partition, changing to 0"
################################################################################
#choose the correct partition
################################################################################
if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
HPCBIND_PARTITION=0
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
fi
################################################################################
# Find cpuset and num threads
@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
else
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
fi
if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
else
HPCBIND_HWLOC_CPUSET="${BINDING}"
fi
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
fi
fi
@ -389,22 +454,22 @@ fi
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
fi
@ -412,43 +477,63 @@ fi
# Print verbose
################################################################################
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
MY_ENV=$(env | sort)
echo "[HPCBIND]"
echo "${MY_ENV}" | grep -E "^HPCBIND_"
echo "[CUDA]"
echo "${MY_ENV}" | grep -E "^CUDA_"
echo "[OPENMP]"
echo "${MY_ENV}" | grep -E "^OMP_"
fi
TMP_ENV=$(env | sort)
if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
echo "[HOST]" >> ${HPCBIND_LOG}
hostname -s >> ${HPCBIND_LOG}
echo "[HPCBIND]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
echo "[CUDA]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
echo "[OPENMP]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "Unable to show bindings, hwloc not available."
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
echo "[BINDINGS]" >> ${HPCBIND_LOG}
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
else
echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
fi
else
echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
hostname -s > >(tee -a ${HPCBIND_LOG})
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
else
echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
fi
fi
################################################################################
# Run command
################################################################################
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
# must be the last executed command so that the return value is correct
if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
# clear output files
> ${HPCBIND_ERR}
> ${HPCBIND_OUT}
if [[ ${HPCBIND_TEE} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
else
eval $@
eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
fi
else
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
else
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
fi
else
echo "Unable to show bindings, hwloc not available."
eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
fi
fi
fi

View File

@ -1,221 +0,0 @@
#!/usr/bin/env bash
# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
#parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False
#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
MY_PID="$BASHPID"
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
echo "$CURRENT_CPUSET"
fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Uses hwloc to divide the node into the given number of groups,"
echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
echo " selected group."
echo ""
echo " NOTE: This command assumes it has exclusive use of the node"
echo ""
echo "Options:"
echo " --proc-bind=<LOC> Set the initial process mask for the script. "
echo " LOC can be any valid location argumnet for"
echo " hwloc-calc. Defaults to the entire machine"
echo " --distribute=N Distribute the current proc-bind into N groups"
echo " --index=I Use the i'th group (zero based)"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " (default 4.0)"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " -v|--verbose"
echo " -h|--help"
echo ""
echo "Sample Usage:"
echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
echo ""
}
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
for i in $@; do
case $i in
# number of partitions to create
--proc-bind=*)
PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
DISTRIBUTE="${i#*=}"
shift
;;
# which group to use
--index=*)
INDEX="${i#*=}"
shift
;;
--openmp=*)
OPENMP_VERSION="${i#*=}"
shift
;;
--no-openmp-proc-bind)
OPENMP_PROC_BIND=False
shift
;;
--no-openmp-nested)
OPENMP_NESTED=False
shift
;;
-v|--verbose)
VERBOSE=True
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
if [[ ${DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
DISTRIBUTE=1
fi
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
echo "Invalid input for index, changing index to 0"
INDEX=0
fi
if [[ ${HAS_HWLOC} -ne 0 ]]; then
echo "hwloc not found, no process binding will occur"
DISTRIBUTE=1
INDEX=0
fi
if [[ ${HAS_HWLOC} -eq 0 ]]; then
if [[ "${CURRENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
CPUSET=${CPUSETS[${INDEX}]}
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: true"
echo " proc_bind: ${PROC_BIND}"
echo " distribute: ${DISTRIBUTE}"
echo " index: ${INDEX}"
echo " parent_cpuset: ${CURRENT_CPUSET}"
echo " cpuset: ${CPUSET}"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
hwloc-bind ${CPUSET} -- $@
else
NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: false"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
eval $@
fi

View File

@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
# Check if we have -std=c++X or --std=c++X already
stdcxx_applied=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -130,10 +133,16 @@ do
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle c++11 setting
--std=c++11|-std=c++11)
#Handle c++11
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
if [ $stdcxx_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
else
shared_args="$shared_args $1"
stdcxx_applied=1
fi
;;
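# Illustrative behavior (an assumption): invoking
#   nvcc_wrapper -std=c++11 --std=c++14 foo.cpp
# keeps only -std=c++11 and emits the warning above, since nvcc accepts
# a single std setting.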
#strip off -std=c++98 due to nvcc warnings, since Tribits will pass both -std=c++11 and -std=c++98
-std=c++98|--std=c++98)
;;

View File

@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a

View File

@ -1,4 +1,4 @@
module purge
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
#Run Trilinos CheckinTest

View File

@ -125,6 +125,123 @@ namespace Impl {
};
}
/// \class GraphRowViewConst
/// \brief View of a row of a sparse graph.
/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
///
/// This class provides a generic view of a row of a sparse graph.
/// We intended this class to view a row of a StaticCrsGraph, but
/// GraphType need not necessarily be StaticCrsGraph.
///
/// The row view is suited for computational kernels like sparse
/// matrix-vector multiply. The view is always const, as it does not
/// allow modifying the graph.
///
/// Here is an example loop over the entries in the row:
/// \code
/// typedef typename GraphRowViewConst<MatrixType>::ordinal_type ordinal_type;
///
/// GraphRowView<GraphType> G_i = ...;
/// const ordinal_type numEntries = G_i.length;
/// for (ordinal_type k = 0; k < numEntries; ++k) {
/// ordinal_type j = G_i.colidx (k);
/// // ... do something with A_ij and j ...
/// }
/// \endcode
///
/// GraphType must provide the \c data_type
/// typedef. In addition, it must make sense to use GraphRowViewConst to
/// view a row of GraphType. In particular, column
/// indices of a row must be accessible using the <tt>entries</tt>
/// resp. <tt>colidx</tt> arrays given to the constructor of this
/// class, with a constant <tt>stride</tt> between successive entries.
/// The stride is one for the compressed sparse row storage format (as
/// is used by CrsMatrix), but may be greater than one for other
/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
template<class GraphType>
struct GraphRowViewConst {
//! The type of the column indices in the row.
typedef const typename GraphType::data_type ordinal_type;
private:
//! Array of (local) column indices in the row.
ordinal_type* colidx_;
/// \brief Stride between successive entries in the row.
///
/// For compressed sparse row (CSR) storage, this is always one.
/// This might be greater than one for storage formats like ELLPACK
/// or Jagged Diagonal. Nevertheless, the stride can never be
/// greater than the number of rows or columns in the matrix. Thus,
/// \c ordinal_type is the correct type.
const ordinal_type stride_;
public:
/// \brief Constructor
///
/// \param colidx [in] Array of the row's column indices.
/// \param stride [in] (Constant) stride between matrix entries in
/// each of the above arrays.
/// \param count [in] Number of entries in the row.
KOKKOS_INLINE_FUNCTION
GraphRowViewConst ( ordinal_type* const colidx_in,
const ordinal_type& stride,
const ordinal_type& count) :
colidx_ (colidx_in), stride_ (stride), length (count)
{}
/// \brief Constructor with offset into \c colidx array
///
/// \param colidx [in] Array of the row's column indices.
/// \param stride [in] (Constant) stride between matrix entries in
/// each of the above arrays.
/// \param count [in] Number of entries in the row.
/// \param idx [in] Start offset into \c colidx array
///
/// \tparam OffsetType The type of \c idx (see above). Must be a
/// built-in integer type. This may differ from ordinal_type.
/// For example, the matrix may have dimensions that fit in int,
/// but a number of entries that does not fit in int.
template<class OffsetType>
KOKKOS_INLINE_FUNCTION
GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
const ordinal_type& stride,
const ordinal_type& count,
const OffsetType& idx,
const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
{}
/// \brief Number of entries in the row.
///
/// This is a public const field rather than a public const method,
/// in order to avoid possible overhead of a method call if the
/// compiler is unable to inline that method call.
///
/// We assume that rows contain no duplicate entries (i.e., entries
/// with the same column index). Thus, a row may have up to
/// A.numCols() entries. This means that the correct type of
/// 'length' is ordinal_type.
const ordinal_type length;
/// \brief (Const) reference to the column index of entry i in this
/// row of the sparse matrix.
///
/// "Entry i" is not necessarily the entry with column index i, nor
/// does i necessarily correspond to the (local) row index.
KOKKOS_INLINE_FUNCTION
ordinal_type& colidx (const ordinal_type& i) const {
return colidx_[i*stride_];
}
/// \brief An alias for colidx
KOKKOS_INLINE_FUNCTION
ordinal_type& operator()(const ordinal_type& i) const {
return colidx(i);
}
};
/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
@ -218,6 +335,38 @@ public:
static_cast<size_type> (0);
}
/// \brief Return a const view of row i of the graph.
///
/// If row i does not belong to the graph, return an empty view.
///
/// The returned object \c view implements the following interface:
/// <ul>
/// <li> \c view.length is the number of entries in the row </li>
/// <li> \c view.colidx(k) returns a const reference to the
/// column index of the k-th entry in the row </li>
/// </ul>
/// k is not a column index; it just counts from 0 to
/// <tt>view.length - 1</tt>.
///
/// Users should not rely on the return type of this method. They
/// should instead assign to 'auto'. That allows compile-time
/// polymorphism for different kinds of sparse matrix formats (e.g.,
/// ELLPACK or Jagged Diagonal) that we may wish to support in the
/// future.
KOKKOS_INLINE_FUNCTION
GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
const size_type start = row_map(i);
// count is guaranteed to fit in ordinal_type, as long as no row
// has duplicate entries.
const data_type count = static_cast<data_type> (row_map(i+1) - start);
if (count == 0) {
return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
} else {
return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
}
}
/** \brief Create a row partitioning into a given number of blocks
* balancing non-zeros + a fixed cost per row.
*/
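
A minimal usage sketch for the new rowConst() accessor, assuming a graph type that provides the data_type typedef described above; the visit_row helper and its name are illustrative only, and per the comment above the return type is assigned to auto:

// Sketch (not part of the patch): visit every entry of row i of a graph
// exposing the rowConst() interface added above.
template <class GraphType>
KOKKOS_INLINE_FUNCTION
void visit_row (const GraphType& graph, typename GraphType::data_type i) {
  auto row = graph.rowConst (i);    // empty view when the row has no entries
  for (typename GraphType::data_type k = 0; k < row.length; ++k) {
    const auto j = row.colidx (k);  // column index of the k-th entry in row i
    (void) j;                       // ... do something with j ...
  }
}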

View File

@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
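
Every offset computation above now adds m_rp.m_lower[*], so each tile maps into the half-open range [lower, upper) per rank instead of assuming a zero lower bound. A hedged sketch of the case this enables, assuming the then-experimental MDRangePolicy interface (names and namespaces may differ across Kokkos versions):

// Sketch: a 2D parallel_for over [5,100) x [7,90); the device-side offsets
// computed above already include the non-zero lower bounds.
void range_with_offset_example () {  // assumes Kokkos::initialize was called
  using Policy = Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >;
  Kokkos::parallel_for (Policy ({5, 7}, {100, 90}),
    KOKKOS_LAMBDA (const int i0, const int i1) {
      // i0 in [5,100), i1 in [7,90)
    });
}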

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
//__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
//__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();

View File

@ -242,45 +242,89 @@ public:
re_ = v;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
complex<RealType>&
operator += (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src.re_;
im_ += src.im_;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator += (const volatile complex<RealType>& src) volatile {
void
operator += (const volatile complex<InputRealType>& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src.re_;
im_ += src.im_;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const RealType& src) {
complex<RealType>&
operator += (const std::complex<RealType>& src) {
re_ += src.real();
im_ += src.imag();
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator += (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator += (const volatile RealType& src) volatile {
void
operator += (const volatile InputRealType& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const complex<RealType>& src) {
complex<RealType>&
operator -= (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ -= src.re_;
im_ -= src.im_;
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const RealType& src) {
complex<RealType>&
operator -= (const std::complex<RealType>& src) {
re_ -= src.real();
im_ -= src.imag();
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator -= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ -= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const complex<RealType>& src) {
complex<RealType>&
operator *= (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
@ -288,8 +332,12 @@ public:
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile complex<RealType>& src) volatile {
void
operator *= (const volatile complex<InputRealType>& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
@ -297,20 +345,43 @@ public:
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const RealType& src) {
complex<RealType>&
operator *= (const std::complex<RealType>& src) {
const RealType realPart = re_ * src.real() - im_ * src.imag();
const RealType imagPart = re_ * src.imag() + im_ * src.real();
re_ = realPart;
im_ = imagPart;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator *= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ *= src;
im_ *= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile RealType& src) volatile {
void
operator *= (const volatile InputRealType& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ *= src;
im_ *= src;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const complex<RealType>& y) {
complex<RealType>&
operator /= (const complex<InputRealType>& y) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
@ -335,56 +406,121 @@ public:
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const RealType& src) {
complex<RealType>&
operator /= (const std::complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
this->re_ /= s;
this->im_ /= s;
}
else {
const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
*this = x_scaled * y_conj_scaled;
*this /= y_scaled_abs;
}
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator /= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ /= src;
im_ /= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& src) {
return (re_ == src.re_) && (im_ == src.im_);
bool
operator == (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
}
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType src) {
return (re_ == src) && (im_ == RealType(0));
bool
operator == (const std::complex<RealType>& src) {
return (re_ == src.real()) && (im_ == src.imag());
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool
operator == (const InputRealType src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool
operator != (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
}
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& src) {
return (re_ != src.re_) || (im_ != src.im_);
bool
operator != (const std::complex<RealType>& src) {
return (re_ != src.real()) || (im_ != src.imag());
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType src) {
return (re_ != src) || (im_ != RealType(0));
bool
operator != (const InputRealType src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
}
};
//! Binary + operator for complex and complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
}
//! Binary + operator for complex and scalar.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x, const RealType& y) {
return complex<RealType> (x.real () + y , x.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const complex<RealType1>& x, const RealType2& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
}
//! Binary + operator for scalar and complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x + y.real (), y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
}
//! Unary + operator for complex.
@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
}
//! Binary - operator for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
}
//! Binary - operator for complex and scalar.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x, const RealType& y) {
return complex<RealType> (x.real () - y , x.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const complex<RealType1>& x, const RealType2& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
}
//! Binary - operator for scalar and complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x - y.real (), - y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
}
//! Unary - operator for complex.
@ -428,11 +564,11 @@ operator - (const complex<RealType>& x) {
}
//! Binary * operator for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
@ -446,10 +582,11 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
/// This function cannot be called in a CUDA device function, because
/// std::complex's methods and nonmember functions are not marked as
/// CUDA device functions.
template<class RealType>
complex<RealType>
operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
template<class RealType1, class RealType2>
inline
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
@ -457,22 +594,22 @@ operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x * y.real (), x * y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
}
/// \brief Binary * operator for complex times RealType.
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const complex<RealType>& y, const RealType& x) {
return complex<RealType> (x * y.real (), x * y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const complex<RealType1>& y, const RealType2& x) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
}
//! Imaginary part of a complex number.
@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType1>
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const complex<RealType1>& x, const RealType2& y) {
return complex<RealType1> (real (x) / y, imag (x) / y);
return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
}
//! Binary operator / for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator / (const complex<RealType>& x, const complex<RealType>& y) {
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
return complex<RealType> (real (x) / s, imag (x) / s);
return complex<common_real_type> (real (x) / s, imag (x) / s);
}
else {
const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
const common_real_type y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
complex<RealType> result = x_scaled * y_conj_scaled;
complex<common_real_type> result = x_scaled * y_conj_scaled;
result /= y_scaled_abs;
return result;
}
@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType1>
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const RealType1& x, const complex<RealType2>& y) {
return complex<RealType1> (x)/y;
return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
}
//! Equality operator for two complex numbers.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) == real (y) && imag (x) == imag (y);
bool
operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) &&
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
}
/// \brief Equality operator for std::complex and Kokkos::complex.
@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
/// Otherwise, CUDA builds will give compiler warnings ("warning:
/// calling a constexpr __host__ function("real") from a __host__
/// __device__ function("operator==") is not allowed").
template<class RealType>
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) == real (y) && std::imag (x) == imag (y);
template<class RealType1, class RealType2>
inline
bool
operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) &&
static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
}
//! Equality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType1>& x, const RealType2& y) {
return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
bool
operator == (const complex<RealType1>& x, const RealType2& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) &&
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
}
//! Equality operator for real and complex number.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType& x, const complex<RealType>& y) {
bool
operator == (const RealType1& x, const complex<RealType2>& y) {
return y == x;
}
//! Inequality operator for two complex numbers.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) != real (y) || imag (x) != imag (y);
bool
operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) ||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
}
//! Inequality operator for std::complex and Kokkos::complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) != real (y) || std::imag (x) != imag (y);
template<class RealType1, class RealType2>
inline
bool
operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) ||
static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
}
//! Inequality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType1>& x, const RealType2& y) {
return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
bool
operator != (const complex<RealType1>& x, const RealType2& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) ||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
}
//! Inequality operator for real and complex number.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType& x, const complex<RealType>& y) {
bool
operator != (const RealType1& x, const complex<RealType2>& y) {
return y != x;
}
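
Taken together, the operators above template both element types and promote through std::common_type, so mixed-precision expressions now compile and yield the wider type. A brief sketch with hypothetical values:

void mixed_precision_example () {
  Kokkos::complex<float>  a (1.0f, 2.0f);
  Kokkos::complex<double> b (3.0, 4.0);
  auto c = a * b;  // Kokkos::complex<double> via std::common_type<float, double>
  auto d = b + 2;  // the int operand converts; result is Kokkos::complex<double>
  b += a;          // compound assignment accepts any convertible element type
  (void) c; (void) d;
}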

View File

@ -353,7 +353,14 @@ struct CountAndFill {
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
data_type* fill = &(m_crs.entries(j));
/* we don't want to access entries(entries.size()), even if it's just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
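
The conditional matters for trailing empty rows: there row_map(i) already equals entries.dimension_0(), so even forming &entries(j) would index one past the end under a bounds-checked View. The same guard in a standalone sketch (names hypothetical):

#include <cstddef>

// Sketch: hand the functor a pointer into the entries array only when the
// row's start offset is in range; empty trailing rows get nullptr instead.
template <class T, class Functor>
void fill_row(T* entries, std::size_t num_entries, std::size_t row_begin,
              int row, Functor&& f) {
  T* fill = (row_begin == num_entries) ? nullptr : &entries[row_begin];
  f(row, fill);  // f must tolerate a null fill pointer
}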

View File

@ -147,12 +147,11 @@ public:
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return "HBW"; }
private:
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "HBW";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
};
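
Moving the body of name() into the class removes the need for an out-of-line definition of a static constexpr member (the definition deleted from the .cpp file further below) and drops the m_name data member entirely. In miniature:

// Sketch: an in-class constexpr name() is complete on its own; no
// out-of-line definition or backing data member is required.
struct SpaceLike {
  static constexpr const char* name() { return "HBW"; }
};
static_assert(SpaceLike::name() != nullptr, "usable in constant expressions");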

View File

@ -192,7 +192,7 @@ template<>
struct reduction_identity<float> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
};
@ -200,7 +200,7 @@ template<>
struct reduction_identity<double> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
};
@ -208,7 +208,7 @@ template<>
struct reduction_identity<long double> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
};
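
The identity of a max-reduction must be the lowest representable value, and FLT_MIN is not that: it is the smallest positive normalized float (about 1.2e-38). Seeding with it silently discards any all-negative input, which -FLT_MAX fixes. A quick demonstration:

#include <cfloat>
#include <algorithm>

// Sketch: FLT_MIN as a max-reduction seed loses all-negative inputs.
float demo() {
  const float vals[] = { -3.0f, -7.5f, -0.5f };
  float wrong = FLT_MIN;    // ends at 1.17549e-38f, not -0.5f
  float right = -FLT_MAX;   // correctly ends at -0.5f
  for (float v : vals) {
    wrong = std::max(wrong, v);
    right = std::max(right, v);
  }
  return right - wrong;     // nonzero: the two seeds disagree
}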

View File

@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
} // namespace Kokkos
#define threadIdx_x (hc_get_workitem_id(0))
#define threadIdx_y (hc_get_workitem_id(1))
#define threadIdx_z (hc_get_workitem_id(2))
#define blockIdx_x (hc_get_group_id(0))
#define blockIdx_y (hc_get_group_id(1))
#define blockIdx_z (hc_get_group_id(2))
#define blockDim_x (hc_get_group_size(0))
#define blockDim_y (hc_get_group_size(1))
#define blockDim_z (hc_get_group_size(2))
#define gridDim_x (hc_get_num_groups(0))
#define gridDim_y (hc_get_num_groups(1))
#define gridDim_z (hc_get_num_groups(2))
#include <ROCm/Kokkos_ROCm_Parallel.hpp>
#include <ROCm/Kokkos_ROCm_Task.hpp>
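
The macro block maps the CUDA-style index vocabulary onto the HC intrinsics, so kernel code shared between backends can keep one spelling. Inside an [[hc]] kernel one could then write, for instance (sketch):

// Sketch: a CUDA-style global 1-D index built from the macros above;
// after expansion this is pure hc_get_* intrinsic calls.
inline int global_id_1d() [[hc]] {
  return blockIdx_x * blockDim_x + threadIdx_x;
}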

View File

@ -88,6 +88,7 @@ build-makefile-kokkos:
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Variables used in application Makefiles" >> Makefile.kokkos
echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos

View File

@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
, thread_local_bytes
);
omp_set_num_threads(partition_size);
f( omp_get_thread_num(), omp_get_num_threads() );
Impl::t_openmp_instance->~Exec();
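
With the added call, each partition's master thread now actually invokes the user functor, passing it the partition id and partition count, values the surrounding code previously computed but never forwarded. Assumed usage, based only on the signature visible in this hunk:

#include <cstdio>
#include <Kokkos_Core.hpp>

// Sketch (API assumed from the hunk above): run one master task per
// partition; each receives its partition id and the partition count.
void run_partitions() {
  Kokkos::OpenMP::partition_master(
    [](int partition_id, int num_partitions) {
      std::printf("partition %d of %d\n", partition_id, num_partitions);
    });
}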

View File

@ -113,7 +113,6 @@ void reduce_enqueue(
if (output_length < 1) return;
assert(output_result != nullptr);
const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
// allocate host and device memory for the results from each team
@ -176,14 +175,17 @@ void reduce_enqueue(
}
});
if (output_result != nullptr)
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
fut.wait();
copy(result,result_cpu.data());
if (output_result != nullptr) {
for(std::size_t i=0;i<td.num_tiles;i++)
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
}
}
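
The reordered epilogue waits on the device future, copies one partial result per tile back to the host, then initializes the output once and joins every partial into it before applying the reducer's final(). Stripped of the Kokkos traits, the host-side combine reduces to this (sketch for a plain sum):

#include <cstddef>

// Sketch: per-tile partials joined sequentially into the output.
void combine_partials(double* output, const double* partials,
                      std::size_t num_tiles) {
  *output = 0.0;                        // ValueInit::init for a sum
  for (std::size_t i = 0; i < num_tiles; ++i)
    *output += partials[i];             // ValueJoin::join
  // ValueFinal::final would post-process here (a no-op for a plain sum)
}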

View File

@ -67,7 +67,7 @@ void scan_enqueue(
hc::array<value_type> result(td.num_tiles);
hc::array<value_type> scratch(len);
tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
@ -135,7 +135,7 @@ void scan_enqueue(
ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
copy(result_cpu.data(),result);
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]]
{
// const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
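
Adding f to the capture lists forces a by-value copy of the functor into the [[hc]] lambdas; under the default &-capture alone, the kernels would reference host-side state that is not valid on the accelerator. The hazard is the usual lambda-lifetime one, shown here in portable C++ (sketch):

#include <functional>

// Sketch: a lambda that outlives (or runs outside) its enclosing scope
// must capture needed state by value, not by reference.
std::function<int(int)> make_kernel() {
  int scale = 3;
  return [=](int x) { return scale * x; };   // by value: safe
  // [&](int x) { return scale * x; }        // would dangle after return --
  // the same failure mode an [[hc]] kernel hits with &-captured host objects
}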

View File

@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
return full != i ? _bit_scan_forward( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_IBM )
return full != i ? __cnttz4( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_CRAYC )
return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
#else
@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return i ? _popcnt(~i & (i-1)) : -1;
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( i & t == 0 ) )
{
t = t << 1;
++r;
int offset = -1;
if ( i ) {
for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
}
return r;
return offset;
#endif
}
@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return i ? shift - _leadz32(i) : 0 ;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#else
unsigned t = 1u << shift;
int r = 0;
while ( i && ( i & t == 0 ) )
{
t = t >> 1;
++r;
int offset = 0;
if ( i ) {
for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
}
return r;
return offset;
#endif
}
@ -142,6 +142,8 @@ int bit_count( unsigned i )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return _popcnt(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else
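
The Cray branches lean on popcount identities: i ^ (i+1) is a mask covering the trailing one-bits of i plus the first zero above them, so its popcount minus one is the index of that first zero; likewise ~i & (i-1) masks exactly the trailing zeros, so its popcount is the index of the lowest set bit. A portable check (sketch):

#include <cassert>

// Sketch: verify the popcount identities used in the Cray branches above.
static int popcnt(unsigned v) { int c = 0; for (; v; v &= v - 1) ++c; return c; }

int main() {
  assert(popcnt(7u ^ (7u + 1u)) - 1 == 3);  // 0b0111: first zero at bit 3
  assert(popcnt(5u ^ (5u + 1u)) - 1 == 1);  // 0b0101: first zero at bit 1
  assert(popcnt(~12u & (12u - 1u)) == 2);   // 0b1100: lowest set bit at 2
  return 0;
}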

View File

@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
}
}
constexpr const char* HBWSpace::name() {
return m_name;
}
} // namespace Experimental
} // namespace Kokkos

View File

@ -114,7 +114,7 @@ struct TestComplexBasicMath {
typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
void testit () {
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
h_results = Kokkos::create_mirror_view(d_results);
Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@ -125,6 +125,7 @@ struct TestComplexBasicMath {
std::complex<double> b(3.25,5.75);
std::complex<double> d(1.0,2.0);
double c = 9.3;
int e = 2;
std::complex<double> r;
r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
@ -147,6 +148,12 @@ struct TestComplexBasicMath {
r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
r = a;
/* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
/* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
/* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
/* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/e); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
}
KOKKOS_INLINE_FUNCTION
@ -190,6 +197,12 @@ struct TestComplexBasicMath {
d_results(17) = c-a;
d_results(18) = c*a;
d_results(19) = c/a;
int e = 2;
d_results(20) = a+e;
d_results(21) = a-e;
d_results(22) = a*e;
d_results(23) = a/e;
}
};
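
Entries 20 through 23 exercise Kokkos::complex arithmetic against a plain int. Note that std::complex itself rejects the mixed expressions (hence the commented-out reference lines), so the host expectations are built componentwise. Reproduced standalone (sketch):

#include <cassert>
#include <complex>

// Sketch: the componentwise expectations behind entries 20-23 above.
int main() {
  std::complex<double> a(1.5, 2.5);
  int e = 2;
  // std::complex<double> + int does not compile, so build the references
  // from real()/imag(), exactly as the test does:
  assert(std::complex<double>(a.real() + e, a.imag())     == a + double(e));
  assert(std::complex<double>(a.real() * e, a.imag() * e) == a * double(e));
  return 0;
}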

View File

@ -286,7 +286,9 @@ struct TestMDRange_2D {
// Test with reducers - scalar
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
int s0 = 1;
int s1 = 1;
range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
TestMDRange_2D functor( N0, N1 );
@ -297,7 +299,7 @@ struct TestMDRange_2D {
parallel_reduce( range, functor, reducer_scalar );
ASSERT_EQ( sum, 2 * N0 * N1 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
}
// Test with reducers - scalar view
{
@ -445,7 +447,9 @@ struct TestMDRange_2D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
const int s0 = 1;
const int s1 = 1;
range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
TestMDRange_2D functor( N0, N1 );
parallel_for( range, functor );
@ -454,8 +458,8 @@ struct TestMDRange_2D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
{
if ( h_view( i, j ) != 3 ) {
++counter;
@ -463,7 +467,7 @@ struct TestMDRange_2D {
}
if ( counter != 0 ) {
printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -699,6 +703,7 @@ struct TestMDRange_2D {
ASSERT_EQ( counter, 0 );
}
} // end test_for2
}; // MDRange_2D
@ -749,7 +754,10 @@ struct TestMDRange_3D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
TestMDRange_3D functor( N0, N1, N2 );
@ -757,7 +765,7 @@ struct TestMDRange_3D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
}
// Test with reducers - scalar
@ -952,7 +960,10 @@ struct TestMDRange_3D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
TestMDRange_3D functor( N0, N1, N2 );
parallel_for( range, functor );
@ -961,9 +972,9 @@ struct TestMDRange_3D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
{
if ( h_view( i, j, k ) != 3 ) {
++counter;
@ -971,7 +982,7 @@ struct TestMDRange_3D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
TestMDRange_4D functor( N0, N1, N2, N3 );
@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
}
// Test with reducers - scalar
@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
TestMDRange_4D functor( N0, N1, N2, N3 );
parallel_for( range, functor );
@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
{
if ( h_view( i, j, k, l ) != 3 ) {
++counter;
@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
}
if ( counter != 0 ) {
printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
}
ASSERT_EQ( counter, 0 );
@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
}
// Test with reducers - scalar
@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
parallel_for( range, functor );
@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int m = 0; m < N4; ++m )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
for ( int m = s4; m < N4; ++m )
{
if ( h_view( i, j, k, l, m ) != 3 ) {
++counter;
@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
int s5 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
}
// Test with reducers - scalar
@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); // tile dims 3,3,3,3,3,3 are more than CUDA can handle with debugging enabled
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
int s5 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); // tile dims 3,3,3,3,3,3 are more than CUDA can handle with debugging enabled
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
parallel_for( range, functor );
@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int m = 0; m < N4; ++m )
for ( int n = 0; n < N5; ++n )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
for ( int m = s4; m < N4; ++m )
for ( int n = s5; n < N5; ++n )
{
if ( h_view( i, j, k, l, m, n ) != 3 ) {
++counter;
@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
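
All of the 2D through 6D variants now start the iteration space at {1,...,1} rather than the origin, so the expected reduction value becomes 2 * prod(Nk - sk) and the host verification loops begin at the offsets. The policy shape under test, in isolation (sketch, using the Experimental namespace of this Kokkos version):

#include <Kokkos_Core.hpp>

// Sketch: a 2-D MDRangePolicy over [1,N0) x [1,N1) with 3x3 tiles,
// the offset-start shape exercised by the tests above.
void offset_range_example(const int N0, const int N1) {
  typedef Kokkos::Experimental::MDRangePolicy<
      Kokkos::Experimental::Rank<2> > range_t;
  range_t range({{1, 1}}, {{N0, N1}}, {{3, 3}});
  Kokkos::parallel_for(range, KOKKOS_LAMBDA(const int i, const int j) {
    (void)i; (void)j;  // visits only i in [1,N0), j in [1,N1)
  });
}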

View File

@ -159,13 +159,13 @@ if buildflag or pathflag:
os.remove("includelink")
if os.path.isfile("liblink") or os.path.islink("liblink"):
os.remove("liblink")
if os.path.isfile("filelink") or os.path.islink("filelink"):
os.remove("filelink")
if os.path.isfile("filelink.o") or os.path.islink("filelink.o"):
os.remove("filelink.o")
cmd = 'ln -s "%s/src" includelink' % lattedir
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
cmd = 'ln -s "%s" liblink' % lattedir
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir
cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
# copy Makefile.lammps.suffix to Makefile.lammps

View File

@ -3,5 +3,5 @@
# GNU Fortran settings
latte_SYSINC =
latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas
latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas
latte_SYSPATH = -fopenmp

View File

@ -3,7 +3,7 @@
# Intel ifort settings
latte_SYSINC =
latte_SYSLIB = ../../lib/latte/filelink \
latte_SYSLIB = ../../lib/latte/filelink.o \
-llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
-lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
-openmp -O0

src/.gitignore
View File

@ -414,6 +414,8 @@
/fix_lambdah_calc.h
/fix_langevin_eff.cpp
/fix_langevin_eff.h
/fix_latte.cpp
/fix_latte.h
/fix_lb_fluid.cpp
/fix_lb_fluid.h
/fix_lb_momentum.cpp

View File

@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then
depend KOKKOS
fi
if (test $1 = "USER-DRUDE") then
depend USER-OMP
fi
if (test $1 = "USER-FEP") then
depend USER-OMP
fi

View File

@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag)
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecAtomicKokkos_PackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecAtomicKokkos_PackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
const size_t elements = 3;
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_buf(i,0) = _x(j,0);
_buf(i,1) = _x(j,1);
_buf(i,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag,
const int* const pbc)
{
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*size_forward;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecAtomicKokkos_PackCommSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecAtomicKokkos_PackCommSelf(
const typename DAT::tdual_x_array &x,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst, const int &pbc_flag, const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecAtomicKokkos_UnpackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
int _first;
AtomVecAtomicKokkos_UnpackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_x(i+_first,0) = _buf(i,0);
_x(i+_first,1) = _buf(i,1);
_x(i+_first,2) = _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
}
}
return m;
}
/* ---------------------------------------------------------------------- */
int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
if (!deform_vremap) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
dvz = pbc[2]*h_rate[2];
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
if (mask[i] & deform_groupbit) {
buf[m++] = h_v(j,0) + dvx;
buf[m++] = h_v(j,1) + dvy;
buf[m++] = h_v(j,2) + dvz;
} else {
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
}
}
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
h_v(i,0) = buf[m++];
h_v(i,1) = buf[m++];
h_v(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
int m = 0;
const int last = first + n;
for (int i = first; i < last; i++) {
buf[m++] = h_f(i,0);
buf[m++] = h_f(i,1);
buf[m++] = h_f(i,2);
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0) {
sync(Host,F_MASK);
modified(Host,F_MASK);
}
int m = 0;
for (int i = 0; i < n; i++) {
const int j = list[i];
h_f(j,0) += buf[m++];
h_f(j,1) += buf[m++];
h_f(j,2) += buf[m++];
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG>
struct AtomVecAtomicKokkos_PackBorder {
typedef DeviceType device_type;

View File

@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
virtual ~AtomVecAtomicKokkos() {}
void grow(int);
void copy(int, int, int);
int pack_comm(int, int *, double *, int, int *);
int pack_comm_vel(int, int *, double *, int, int *);
void unpack_comm(int, int, double *);
void unpack_comm_vel(int, int, double *);
int pack_reverse(int, int, double *);
void unpack_reverse(int, int *, double *);
int pack_border(int, int *, double *, int, int *);
int pack_border_vel(int, int *, double *, int, int *);
void unpack_border(int, int, double *);
@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
bigint memory_usage();
void grow_reset();
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag, const int pbc[]);
void unpack_comm_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf);
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]);
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
DAT::tdual_xfloat_2d buf,int iswap,
int pbc_flag, int *pbc, ExecutionSpace space);
@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
DAT::t_x_array d_x;
DAT::t_v_array d_v;
DAT::t_f_array d_f;
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
DAT::tdual_int_1d k_count;
};

View File

@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag)
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecBondKokkos_PackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecBondKokkos_PackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
const size_t elements = 3;
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_buf(i,0) = _x(j,0);
_buf(i,1) = _x(j,1);
_buf(i,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag,
const int* const pbc)
{
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*size_forward;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecBondKokkos_PackCommSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecBondKokkos_PackCommSelf(
const typename DAT::tdual_x_array &x,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst, const int &pbc_flag, const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecBondKokkos_UnpackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
int _first;
AtomVecBondKokkos_UnpackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_x(i+_first,0) = _buf(i,0);
_x(i+_first,1) = _buf(i,1);
_x(i+_first,2) = _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
}
}
return m;
}
/* ---------------------------------------------------------------------- */
int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
if (!deform_vremap) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
dvz = pbc[2]*h_rate[2];
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
if (mask[i] & deform_groupbit) {
buf[m++] = h_v(j,0) + dvx;
buf[m++] = h_v(j,1) + dvy;
buf[m++] = h_v(j,2) + dvz;
} else {
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
}
}
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
h_v(i,0) = buf[m++];
h_v(i,1) = buf[m++];
h_v(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
int m = 0;
const int last = first + n;
for (int i = first; i < last; i++) {
buf[m++] = h_f(i,0);
buf[m++] = h_f(i,1);
buf[m++] = h_f(i,2);
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0)
modified(Host,F_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
const int j = list[i];
h_f(j,0) += buf[m++];
h_f(j,1) += buf[m++];
h_f(j,2) += buf[m++];
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG>
struct AtomVecBondKokkos_PackBorder {
typedef DeviceType device_type;

View File

@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
virtual ~AtomVecBondKokkos() {}
void grow(int);
void copy(int, int, int);
int pack_comm(int, int *, double *, int, int *);
int pack_comm_vel(int, int *, double *, int, int *);
void unpack_comm(int, int, double *);
void unpack_comm_vel(int, int, double *);
int pack_reverse(int, int, double *);
void unpack_reverse(int, int *, double *);
int pack_border(int, int *, double *, int, int *);
int pack_border_vel(int, int *, double *, int, int *);
int pack_border_hybrid(int, int *, double *);
@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
bigint memory_usage();
void grow_reset();
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag, const int pbc[]);
void unpack_comm_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf);
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]);
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
DAT::tdual_xfloat_2d buf,int iswap,
int pbc_flag, int *pbc, ExecutionSpace space);
@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
DAT::t_x_array d_x;
DAT::t_v_array d_v;
DAT::t_f_array d_f;
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
DAT::t_tagint_1d d_molecule;
DAT::t_int_2d d_nspecial;

View File

@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm {
/* ---------------------------------------------------------------------- */
int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag,
const int* const pbc)
{
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*size_forward;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecChargeKokkos_PackCommSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecChargeKokkos_PackCommSelf(
const typename DAT::tdual_x_array &x,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst, const int &pbc_flag, const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecChargeKokkos_UnpackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
int _first;
AtomVecChargeKokkos_UnpackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_x(i+_first,0) = _buf(i,0);
_x(i+_first,1) = _buf(i,1);
_x(i+_first,2) = _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
}
}
return m;
}
/* ---------------------------------------------------------------------- */
int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
if (!deform_vremap) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
dvz = pbc[2]*h_rate[2];
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
if (mask[i] & deform_groupbit) {
buf[m++] = h_v(j,0) + dvx;
buf[m++] = h_v(j,1) + dvy;
buf[m++] = h_v(j,2) + dvz;
} else {
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
}
}
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
h_v(i,0) = buf[m++];
h_v(i,1) = buf[m++];
h_v(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
int m = 0;
const int last = first + n;
for (int i = first; i < last; i++) {
buf[m++] = h_f(i,0);
buf[m++] = h_f(i,1);
buf[m++] = h_f(i,2);
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0)
modified(Host,F_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
const int j = list[i];
h_f(j,0) += buf[m++];
h_f(j,1) += buf[m++];
h_f(j,2) += buf[m++];
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG>
struct AtomVecChargeKokkos_PackBorder {
typedef DeviceType device_type;

View File

@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
virtual ~AtomVecChargeKokkos() {}
void grow(int);
void copy(int, int, int);
int pack_comm(int, int *, double *, int, int *);
int pack_comm_vel(int, int *, double *, int, int *);
void unpack_comm(int, int, double *);
void unpack_comm_vel(int, int, double *);
int pack_reverse(int, int, double *);
void unpack_reverse(int, int *, double *);
int pack_border(int, int *, double *, int, int *);
int pack_border_vel(int, int *, double *, int, int *);
int pack_border_hybrid(int, int *, double *);
@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
bigint memory_usage();
void grow_reset();
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag, const int pbc[]);
void unpack_comm_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf);
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]);
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
DAT::tdual_xfloat_2d buf,int iswap,
int pbc_flag, int *pbc, ExecutionSpace space);
@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
DAT::t_x_array d_x;
DAT::t_v_array d_v;
DAT::t_f_array d_f;
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
DAT::t_float_1d d_q;

View File

@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
DAT::t_x_array d_x;
DAT::t_v_array d_v;
DAT::t_f_array d_f;
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
DAT::tdual_int_1d k_count;
};

View File

@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag)
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecFullKokkos_PackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecFullKokkos_PackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
const size_t maxsend = (buf.view<DeviceType>().dimension_0()
*buf.view<DeviceType>().dimension_1())/3;
const size_t elements = 3;
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_buf(i,0) = _x(j,0);
_buf(i,1) = _x(j,1);
_buf(i,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag,
const int* const pbc)
{
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackComm<LMPHostType,1,1>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackComm<LMPHostType,1,0>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackComm<LMPHostType,0,1>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackComm<LMPHostType,0,0>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,1>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,0>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,1>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,0>
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*size_forward;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecFullKokkos_PackCommSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecFullKokkos_PackCommSelf(
const typename DAT::tdual_x_array &x,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),
_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap,
const int nfirst, const int &pbc_flag,
const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,1>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,0>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,1>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,0>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,1>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,0>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,1>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,0>
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecFullKokkos_UnpackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
int _first;
AtomVecFullKokkos_UnpackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_x(i+_first,0) = _buf(i,0);
_x(i+_first,1) = _buf(i,1);
_x(i+_first,2) = _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
}
}
return m;
}
/* ---------------------------------------------------------------------- */
int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
if (!deform_vremap) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
dvz = pbc[2]*h_rate[2];
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
if (mask[i] & deform_groupbit) {
buf[m++] = h_v(j,0) + dvx;
buf[m++] = h_v(j,1) + dvy;
buf[m++] = h_v(j,2) + dvz;
} else {
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
}
}
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
h_v(i,0) = buf[m++];
h_v(i,1) = buf[m++];
h_v(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
int m = 0;
const int last = first + n;
for (int i = first; i < last; i++) {
buf[m++] = h_f(i,0);
buf[m++] = h_f(i,1);
buf[m++] = h_f(i,2);
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0)
modified(Host,F_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
const int j = list[i];
h_f(j,0) += buf[m++];
h_f(j,1) += buf[m++];
h_f(j,2) += buf[m++];
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG>
struct AtomVecFullKokkos_PackBorder {
typedef DeviceType device_type;

View File

@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
virtual ~AtomVecFullKokkos() {}
void grow(int);
void copy(int, int, int);
int pack_comm(int, int *, double *, int, int *);
int pack_comm_vel(int, int *, double *, int, int *);
void unpack_comm(int, int, double *);
void unpack_comm_vel(int, int, double *);
int pack_reverse(int, int, double *);
void unpack_reverse(int, int *, double *);
int pack_border(int, int *, double *, int, int *);
int pack_border_vel(int, int *, double *, int, int *);
int pack_border_hybrid(int, int *, double *);
@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
bigint memory_usage();
void grow_reset();
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag, const int pbc[]);
void unpack_comm_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf);
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]);
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
DAT::tdual_xfloat_2d buf,int iswap,
int pbc_flag, int *pbc, ExecutionSpace space);
@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
DAT::t_x_array d_x;
DAT::t_v_array d_v;
DAT::t_f_array d_f;
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
DAT::t_float_1d d_q;
HAT::t_float_1d h_q;

View File

@ -12,6 +12,10 @@
------------------------------------------------------------------------- */
#include "atom_vec_kokkos.h"
#include "atom_kokkos.h"
#include "comm_kokkos.h"
#include "domain.h"
#include "atom_masks.h"
using namespace LAMMPS_NS;
@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp)
buffer_size = 0;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecKokkos_PackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecKokkos_PackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
const size_t elements = 3;
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_buf(i,0) = _x(j,0);
_buf(i,1) = _x(j,1);
_buf(i,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
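The _um (unmanaged) buffer member above is what lets one flat communication buffer be reused as an n-by-3 view without an allocation or a copy; buffer_view itself is a LAMMPS helper, so the sketch below only illustrates the underlying Kokkos mechanism (assuming a Kokkos build; names are illustrative):

#include <Kokkos_Core.hpp>

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  {
    // A flat send buffer, as the comm classes allocate it.
    Kokkos::View<double*> flat("comm:buf", 30);
    // Reinterpret the same memory as 10 rows of (x,y,z): no copy, no
    // ownership -- the unmanaged view is just a typed window onto flat.
    Kokkos::View<double*[3], Kokkos::LayoutRight,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
      xyz(flat.data(), flat.extent(0)/3);
    Kokkos::parallel_for(10, KOKKOS_LAMBDA(const int i) {
      xyz(i,0) = 1.0*i; xyz(i,1) = 2.0*i; xyz(i,2) = 3.0*i;
    });
  }
  Kokkos::finalize();
  return 0;
}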
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_comm_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag,
const int* const pbc)
{
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*size_forward;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
struct AtomVecKokkos_PackCommSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
X_FLOAT _pbc[6];
AtomVecKokkos_PackCommSelf(
const typename DAT::tdual_x_array &x,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
if (PBC_FLAG == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst, const int &pbc_flag, const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
} else {
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz,pbc);
Kokkos::parallel_for(n,f);
}
}
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecKokkos_UnpackComm {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array _x;
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
int _first;
AtomVecKokkos_UnpackComm(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_xfloat_2d &buf,
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_x(i+_first,0) = _buf(i,0);
_x(i+_first,1) = _buf(i,1);
_x(i+_first,2) = _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
struct AtomVecKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
struct AtomVecKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
}
}
return m;
}
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0);
buf[m++] = h_x(j,1);
buf[m++] = h_x(j,2);
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
if (domain->triclinic == 0) {
dx = pbc[0]*domain->xprd;
dy = pbc[1]*domain->yprd;
dz = pbc[2]*domain->zprd;
} else {
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
dz = pbc[2]*domain->zprd;
}
if (!deform_vremap) {
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
} else {
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
dvz = pbc[2]*h_rate[2];
for (i = 0; i < n; i++) {
j = list[i];
buf[m++] = h_x(j,0) + dx;
buf[m++] = h_x(j,1) + dy;
buf[m++] = h_x(j,2) + dz;
if (atom->mask[i] & deform_groupbit) {
buf[m++] = h_v(j,0) + dvx;
buf[m++] = h_v(j,1) + dvy;
buf[m++] = h_v(j,2) + dvz;
} else {
buf[m++] = h_v(j,0);
buf[m++] = h_v(j,1);
buf[m++] = h_v(j,2);
}
}
}
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecKokkos::unpack_comm(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf)
{
int i,m,last;
m = 0;
last = first + n;
for (i = first; i < last; i++) {
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
h_v(i,0) = buf[m++];
h_v(i,1) = buf[m++];
h_v(i,2) = buf[m++];
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecKokkos_PackReverse {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
typename ArrayTypes<DeviceType>::t_ffloat_2d _buf;
int _first;
AtomVecKokkos_PackReverse(
const typename DAT::tdual_f_array &f,
const typename DAT::tdual_ffloat_2d &buf,
const int& first):_f(f.view<DeviceType>()),_buf(buf.view<DeviceType>()),
_first(first) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
_buf(i,0) = _f(i+_first,0);
_buf(i,1) = _f(i+_first,1);
_buf(i,2) = _f(i+_first,2);
}
};
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first,
const DAT::tdual_ffloat_2d &buf ) {
if(commKK->reverse_comm_on_host) {
sync(Host,F_MASK);
struct AtomVecKokkos_PackReverse<LMPHostType> f(atomKK->k_f,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,F_MASK);
struct AtomVecKokkos_PackReverse<LMPDeviceType> f(atomKK->k_f,buf,first);
Kokkos::parallel_for(n,f);
}
return n*size_reverse;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecKokkos_UnPackReverseSelf {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
typename ArrayTypes<DeviceType>::t_f_array _fw;
int _nfirst;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
AtomVecKokkos_UnPackReverseSelf(
const typename DAT::tdual_f_array &f,
const int &nfirst,
const typename DAT::tdual_int_2d &list,
const int & iswap):
_f(f.view<DeviceType>()),_fw(f.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap) {
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
_fw(j,0) += _f(i+_nfirst,0);
_fw(j,1) += _f(i+_nfirst,1);
_fw(j,2) += _f(i+_nfirst,2);
}
};
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst) {
if(commKK->reverse_comm_on_host) {
sync(Host,F_MASK);
struct AtomVecKokkos_UnPackReverseSelf<LMPHostType> f(atomKK->k_f,nfirst,list,iswap);
Kokkos::parallel_for(n,f);
modified(Host,F_MASK);
} else {
sync(Device,F_MASK);
struct AtomVecKokkos_UnPackReverseSelf<LMPDeviceType> f(atomKK->k_f,nfirst,list,iswap);
Kokkos::parallel_for(n,f);
modified(Device,F_MASK);
}
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct AtomVecKokkos_UnPackReverse {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_f_array _f;
typename ArrayTypes<DeviceType>::t_ffloat_2d_const _buf;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
const int _iswap;
AtomVecKokkos_UnPackReverse(
const typename DAT::tdual_f_array &f,
const typename DAT::tdual_ffloat_2d &buf,
const typename DAT::tdual_int_2d &list,
const int & iswap):
_f(f.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap) {
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
const size_t elements = 3;
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
};
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
const int j = _list(_iswap,i);
_f(j,0) += _buf(i,0);
_f(j,1) += _buf(i,1);
_f(j,2) += _buf(i,2);
}
};
/* ---------------------------------------------------------------------- */
void AtomVecKokkos::unpack_reverse_kokkos(const int &n,
const DAT::tdual_int_2d &list,
const int & iswap,
const DAT::tdual_ffloat_2d &buf)
{
// Check whether to always run reverse communication on the host
// Choose correct reverse UnPackReverse kernel
if(commKK->reverse_comm_on_host) {
struct AtomVecKokkos_UnPackReverse<LMPHostType> f(atomKK->k_f,buf,list,iswap);
Kokkos::parallel_for(n,f);
modified(Host,F_MASK);
} else {
struct AtomVecKokkos_UnPackReverse<LMPDeviceType> f(atomKK->k_f,buf,list,iswap);
Kokkos::parallel_for(n,f);
modified(Device,F_MASK);
}
}
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
int m = 0;
const int last = first + n;
for (int i = first; i < last; i++) {
buf[m++] = h_f(i,0);
buf[m++] = h_f(i,1);
buf[m++] = h_f(i,2);
}
return m;
}
/* ---------------------------------------------------------------------- */
void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
{
int m = 0;
for (int i = 0; i < n; i++) {
const int j = list[i];
h_f(j,0) += buf[m++];
h_f(j,1) += buf[m++];
h_f(j,2) += buf[m++];
}
if(n > 0)
modified(Host,F_MASK);
}
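All of the legacy pack/unpack routines above bracket their host-side loops with sync()/modified() calls; the AtomVecKokkos wrappers map those onto per-array dirty-flag tracking over Kokkos dual views. A minimal sketch of that protocol with a bare Kokkos::DualView (assuming a Kokkos build; the LAMMPS masks such as F_MASK select which arrays participate):

#include <Kokkos_DualView.hpp>

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DualView<double*[3]> f("f", 100);
    auto d_f = f.d_view;                        // device-side view of the data
    Kokkos::parallel_for(100, KOKKOS_LAMBDA(const int i) {
      d_f(i,0) = 1.0; d_f(i,1) = 0.0; d_f(i,2) = 0.0;  // produce forces on device
    });
    f.modify<Kokkos::DefaultExecutionSpace>();  // mark the device copy as newer
    f.sync<Kokkos::HostSpace>();                // deep-copies only if out of date
    double fx0 = f.h_view(0,0);                 // host read, as pack_reverse() does
    (void)fx0;
  }
  Kokkos::finalize();
  return 0;
}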

View File

@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec {
public:
AtomVecKokkos(class LAMMPS *);
virtual ~AtomVecKokkos() {}
virtual int pack_comm(int, int *, double *, int, int *);
virtual int pack_comm_vel(int, int *, double *, int, int *);
virtual void unpack_comm(int, int, double *);
virtual void unpack_comm_vel(int, int, double *);
virtual int pack_reverse(int, int, double *);
virtual void unpack_reverse(int, int *, double *);
virtual void sync(ExecutionSpace space, unsigned int mask) = 0;
virtual void modified(ExecutionSpace space, unsigned int mask) = 0;
virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {};
virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0;
virtual int
pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]) = 0;
//{return 0;}
const int &pbc_flag, const int pbc[]);
virtual int
pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const DAT::tdual_xfloat_2d &buf,
const int &pbc_flag, const int pbc[]) = 0;
//{return 0;}
const int &pbc_flag, const int pbc[]);
virtual void
unpack_comm_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf) = 0;
const DAT::tdual_xfloat_2d &buf);
virtual int
unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const int nfirst);
virtual int
pack_reverse_kokkos(const int &n, const int &nfirst,
const DAT::tdual_ffloat_2d &buf);
virtual void
unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const DAT::tdual_ffloat_2d &buf);
virtual int
pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
DAT::tdual_xfloat_2d buf,int iswap,
int pbc_flag, int *pbc, ExecutionSpace space) = 0;
//{return 0;};
virtual void
unpack_border_kokkos(const int &n, const int &nfirst,
const DAT::tdual_xfloat_2d &buf,
@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec {
DAT::tdual_int_1d k_sendlist,
DAT::tdual_int_1d k_copylist,
ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0;
//{return 0;};
virtual int
unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
ExecutionSpace space) = 0;
//{return 0;};
protected:
HAT::t_x_array h_x;
HAT::t_v_array h_v;
HAT::t_f_array h_f;
class CommKokkos *commKK;
size_t buffer_size;
void* buffer;

View File

@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
memory->sfree(sendlist);
sendlist = NULL;
k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
k_sendlist = DAT::tdual_int_2d();
k_total_send = DAT::tdual_int_scalar("comm::k_total_send");
// error check for disallow of OpenMP threads?
@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
memory->destroy(buf_recv);
buf_recv = NULL;
k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
k_exchange_sendlist = DAT::
tdual_int_1d("comm:k_exchange_sendlist",100);
k_exchange_copylist = ArrayTypes<LMPDeviceType>::
k_exchange_copylist = DAT::
tdual_int_1d("comm:k_exchange_copylist",100);
k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
k_count = DAT::tdual_int_scalar("comm:k_count");
k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
memory->destroy(maxsendlist);
maxsendlist = NULL;
@ -102,8 +103,10 @@ void CommKokkos::init()
atomKK = (AtomKokkos *) atom;
exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
forward_comm_classic = lmp->kokkos->forward_comm_classic;
reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;
CommBrick::init();
@ -132,8 +135,11 @@ void CommKokkos::init()
if (force->newton == 0) check_reverse = 0;
if (force->pair) check_reverse += force->pair->comm_reverse_off;
if(check_reverse || check_forward)
if (ghost_velocity)
forward_comm_classic = true;
if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
reverse_comm_classic = true;
}
/* ----------------------------------------------------------------------
@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy)
int n;
MPI_Request request;
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
double **x = atom->x;
double *buf;
// exchange data with another proc
@ -181,15 +186,11 @@ void CommKokkos::forward_comm_device(int dummy)
// if comm_x_only set, exchange or copy directly to x, don't unpack
k_sendlist.sync<DeviceType>();
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
for (int iswap = 0; iswap < nswap; iswap++) {
if (sendproc[iswap] != me) {
if (comm_x_only) {
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
else buf = NULL;
if (size_forward_recv[iswap]) {
buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
@ -198,15 +199,16 @@ void CommKokkos::forward_comm_device(int dummy)
}
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
if (n) {
MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
n,MPI_DOUBLE,sendproc[iswap],0,world);
}
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
if (size_forward_recv[iswap]) {
MPI_Wait(&request,MPI_STATUS_IGNORE);
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
space,X_MASK);
}
} else if (ghost_velocity) {
error->all(FLERR,"Ghost velocity forward comm not yet "
"implemented with Kokkos");
@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy)
}
}
}
/* ----------------------------------------------------------------------
reverse communication of forces on atoms every timestep
other per-atom attributes may also be sent via pack/unpack routines
------------------------------------------------------------------------- */
void CommKokkos::reverse_comm()
{
if (!reverse_comm_classic) {
if (reverse_comm_on_host) reverse_comm_device<LMPHostType>();
else reverse_comm_device<LMPDeviceType>();
return;
}
k_sendlist.sync<LMPHostType>();
if (comm_f_only)
atomKK->sync(Host,F_MASK);
else
atomKK->sync(Host,ALL_MASK);
CommBrick::reverse_comm();
if (comm_f_only)
atomKK->modified(Host,F_MASK);
else
atomKK->modified(Host,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
//atomKK->sync(Device,ALL_MASK); // is this needed?
}
template<class DeviceType>
void CommKokkos::reverse_comm_device()
{
int n;
MPI_Request request;
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
double *buf;
// exchange data with another proc
// if other proc is self, just copy
// if comm_f_only set, exchange or copy directly from f, don't pack
k_sendlist.sync<DeviceType>();
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
for (int iswap = nswap-1; iswap >= 0; iswap--) {
if (sendproc[iswap] != me) {
if (comm_f_only) {
if (size_reverse_recv[iswap])
MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
sendproc[iswap],0,world,&request);
if (size_reverse_send[iswap]) {
buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1();
MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
recvproc[iswap],0,world);
}
if (size_reverse_recv[iswap]) {
MPI_Wait(&request,MPI_STATUS_IGNORE);
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
space,F_MASK);
}
} else {
if (size_reverse_recv[iswap])
MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
size_reverse_recv[iswap],MPI_DOUBLE,
sendproc[iswap],0,world,&request);
n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
if (n)
MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
MPI_DOUBLE,recvproc[iswap],0,world);
if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
}
avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_recv);
} else {
if (sendnum[iswap])
n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
firstrecv[iswap]);
}
}
}
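In the comm_f_only branch above, the receive is posted first, the ghost-force block is sent straight out of k_f with no packing step, and the wait is deferred until the buffer is actually needed. The same post-receive/send/wait skeleton in plain MPI (a sketch with illustrative names, not the LAMMPS call sites):

#include <mpi.h>

// Exchange one block of doubles with a partner rank. Posting the
// receive before the send lets the two transfers overlap and avoids
// deadlock when both ranks enter the swap at the same time.
static void swap_block(double *recv_buf, int nrecv,
                       double *send_buf, int nsend,
                       int partner, MPI_Comm comm) {
  MPI_Request request;
  if (nrecv) MPI_Irecv(recv_buf, nrecv, MPI_DOUBLE, partner, 0, comm, &request);
  if (nsend) MPI_Send(send_buf, nsend, MPI_DOUBLE, partner, 0, comm);
  if (nrecv) MPI_Wait(&request, MPI_STATUS_IGNORE);
}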
/* ---------------------------------------------------------------------- */
void CommKokkos::forward_comm_fix(Fix *fix, int size)
{
k_sendlist.sync<LMPHostType>();
@ -408,7 +482,7 @@ struct BuildExchangeListFunctor {
typename AT::t_x_array _x;
int _nlocal,_dim;
typename AT::t_int_1d _nsend;
typename AT::t_int_scalar _nsend;
typename AT::t_int_1d _sendlist;
typename AT::t_int_1d _sendflag;
@ -416,7 +490,7 @@ struct BuildExchangeListFunctor {
BuildExchangeListFunctor(
const typename AT::tdual_x_array x,
const typename AT::tdual_int_1d sendlist,
typename AT::tdual_int_1d nsend,
typename AT::tdual_int_scalar nsend,
typename AT::tdual_int_1d sendflag,int nlocal, int dim,
X_FLOAT lo, X_FLOAT hi):
_x(x.template view<DeviceType>()),
@ -430,7 +504,7 @@ struct BuildExchangeListFunctor {
KOKKOS_INLINE_FUNCTION
void operator() (int i) const {
if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1);
const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
if(mysend<_sendlist.dimension_0()) {
_sendlist(mysend) = i;
_sendflag(i) = 1;
@ -489,9 +563,9 @@ void CommKokkos::exchange_device()
if (true) {
if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal);
k_sendflag.sync<DeviceType>();
k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
k_count.h_view(0) = 0;
k_count.h_view() = k_exchange_sendlist.h_view.dimension_0();
while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
k_count.h_view() = 0;
k_count.modify<LMPHostType>();
k_count.sync<DeviceType>();
@ -504,10 +578,10 @@ void CommKokkos::exchange_device()
k_count.modify<DeviceType>();
k_count.sync<LMPHostType>();
if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
k_exchange_copylist.resize(k_count.h_view(0)*1.1);
k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0();
if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
k_exchange_sendlist.resize(k_count.h_view()*1.1);
k_exchange_copylist.resize(k_count.h_view()*1.1);
k_count.h_view()=k_exchange_sendlist.h_view.dimension_0();
}
}
k_exchange_copylist.sync<LMPHostType>();
@ -515,8 +589,8 @@ void CommKokkos::exchange_device()
k_sendflag.sync<LMPHostType>();
int sendpos = nlocal-1;
nlocal -= k_count.h_view(0);
for(int i = 0; i < k_count.h_view(0); i++) {
nlocal -= k_count.h_view();
for(int i = 0; i < k_count.h_view(); i++) {
if (k_exchange_sendlist.h_view(i)<nlocal) {
while (k_sendflag.h_view(sendpos)) sendpos--;
k_exchange_copylist.h_view(i) = sendpos;
@ -527,10 +601,10 @@ void CommKokkos::exchange_device()
k_exchange_copylist.modify<LMPHostType>();
k_exchange_copylist.sync<DeviceType>();
nsend = k_count.h_view(0);
nsend = k_count.h_view();
if (nsend > maxsend) grow_send_kokkos(nsend,1);
nsend =
avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
k_exchange_sendlist,k_exchange_copylist,
ExecutionSpaceFromDevice<DeviceType>::
space,dim,lo,hi);
@ -640,9 +714,7 @@ void CommKokkos::borders()
}
atomKK->sync(Host,ALL_MASK);
atomKK->modified(Host,ALL_MASK);
k_sendlist.sync<LMPHostType>();
k_sendlist.modify<LMPHostType>();
CommBrick::borders();
k_sendlist.modify<LMPHostType>();
atomKK->modified(Host,ALL_MASK);
@ -659,11 +731,11 @@ struct BuildBorderListFunctor {
int iswap,maxsendlist;
int nfirst,nlast,dim;
typename AT::t_int_2d sendlist;
typename AT::t_int_1d nsend;
typename AT::t_int_scalar nsend;
BuildBorderListFunctor(typename AT::tdual_x_array _x,
typename AT::tdual_int_2d _sendlist,
typename AT::tdual_int_1d _nsend,int _nfirst,
typename AT::tdual_int_scalar _nsend,int _nfirst,
int _nlast, int _dim,
X_FLOAT _lo, X_FLOAT _hi, int _iswap,
int _maxsendlist):
@ -684,7 +756,7 @@ struct BuildBorderListFunctor {
for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) {
if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++;
}
const int my_store_pos = dev.team_scan(mysend,&nsend(0));
const int my_store_pos = dev.team_scan(mysend,&nsend());
if (my_store_pos+mysend < maxsendlist) {
mysend = my_store_pos;
@ -713,7 +785,7 @@ void CommKokkos::borders_device() {
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
k_sendlist.modify<DeviceType>();
k_sendlist.sync<DeviceType>();
atomKK->sync(exec_space,ALL_MASK);
// do swaps over all 3 dimensions
@ -763,37 +835,38 @@ void CommKokkos::borders_device() {
if (sendflag) {
if (!bordergroup || ineed >= 2) {
if (style == SINGLE) {
typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
total_send.h_view(0) = 0;
if(exec_space == Device) {
total_send.template modify<DeviceType>();
total_send.template sync<LMPDeviceType>();
}
k_total_send.h_view() = 0;
k_total_send.template modify<LMPHostType>();
k_total_send.template sync<LMPDeviceType>();
BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
Kokkos::parallel_for(config,f);
total_send.template modify<DeviceType>();
total_send.template sync<LMPHostType>();
k_total_send.template modify<DeviceType>();
k_total_send.template sync<LMPHostType>();
if(total_send.h_view(0) >= maxsendlist[iswap]) {
grow_list(iswap,total_send.h_view(0));
k_sendlist.modify<DeviceType>();
total_send.h_view(0) = 0;
if(exec_space == Device) {
total_send.template modify<LMPHostType>();
total_send.template sync<LMPDeviceType>();
}
if(k_total_send.h_view() >= maxsendlist[iswap]) {
grow_list(iswap,k_total_send.h_view());
k_total_send.h_view() = 0;
k_total_send.template modify<LMPHostType>();
k_total_send.template sync<LMPDeviceType>();
BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
Kokkos::parallel_for(config,f);
total_send.template modify<DeviceType>();
total_send.template sync<LMPHostType>();
k_total_send.template modify<DeviceType>();
k_total_send.template sync<LMPHostType>();
k_sendlist.modify<DeviceType>();
}
nsend = total_send.h_view(0);
nsend = k_total_send.h_view();
} else {
error->all(FLERR,"Required border comm not yet "
"implemented with Kokkos");
@ -916,10 +989,11 @@ void CommKokkos::borders_device() {
// reset global->local map
if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
atomKK->modified(exec_space,ALL_MASK);
if (map_style) {
atomKK->sync(Host,TAG_MASK);
if (map_style) atom->map_set();
atom->map_set();
}
}
/* ----------------------------------------------------------------------
realloc the size of the send buffer as needed with BUFFACTOR and bufextra
@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
}
else {
k_buf_send = ArrayTypes<LMPDeviceType>::
k_buf_send = DAT::
tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border);
buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
}
@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space)
{
maxrecv = static_cast<int> (BUFFACTOR * n);
int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2;
k_buf_recv = ArrayTypes<LMPDeviceType>::
k_buf_recv = DAT::
tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border);
buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
}
@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n)
{
int size = static_cast<int> (BUFFACTOR * n);
if (exchange_comm_classic) { // force realloc on Host
k_sendlist.sync<LMPHostType>();
k_sendlist.modify<LMPHostType>();
}
memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
for(int i=0;i<maxswap;i++) {
@ -1011,6 +1090,11 @@ void CommKokkos::grow_swap(int n)
maxswap = n;
int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);
if (exchange_comm_classic) { // force realloc on Host
k_sendlist.sync<LMPHostType>();
k_sendlist.modify<LMPHostType>();
}
memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
memory->grow(maxsendlist,n,"comm:maxsendlist");

View File

@ -25,8 +25,10 @@ class CommKokkos : public CommBrick {
bool exchange_comm_classic;
bool forward_comm_classic;
bool reverse_comm_classic;
bool exchange_comm_on_host;
bool forward_comm_on_host;
bool reverse_comm_on_host;
CommKokkos(class LAMMPS *);
~CommKokkos();
@ -47,15 +49,17 @@ class CommKokkos : public CommBrick {
void reverse_comm_dump(class Dump *); // reverse comm from a Dump
template<class DeviceType> void forward_comm_device(int dummy);
template<class DeviceType> void reverse_comm_device();
template<class DeviceType> void forward_comm_pair_device(Pair *pair);
template<class DeviceType> void exchange_device();
template<class DeviceType> void borders_device();
protected:
DAT::tdual_int_2d k_sendlist;
DAT::tdual_int_scalar k_total_send;
DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
DAT::tdual_int_1d k_count;
DAT::tdual_int_scalar k_count;
//double *buf_send; // send buffer for all comm
//double *buf_recv; // recv buffer for all comm

View File

@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
nmax = m_cap = 0;
allocated_flag = 0;
nprev = 4;
}
/* ---------------------------------------------------------------------- */
@ -158,15 +159,15 @@ void FixQEqReaxKokkos<DeviceType>::init_hist()
{
int i,j;
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5);
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev);
d_s_hist = k_s_hist.template view<DeviceType>();
h_s_hist = k_s_hist.h_view;
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5);
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev);
d_t_hist = k_t_hist.template view<DeviceType>();
h_t_hist = k_t_hist.h_view;
for( i = 0; i < atom->nmax; i++ )
for( j = 0; j < 5; j++ )
for( j = 0; j < nprev; j++ )
k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0;
k_s_hist.template modify<LMPHostType>();
@ -334,11 +335,11 @@ void FixQEqReaxKokkos<DeviceType>::allocate_array()
d_d = k_d.template view<DeviceType>();
h_d = k_d.h_view;
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5);
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev);
d_s_hist = k_s_hist.template view<DeviceType>();
h_s_hist = k_s_hist.h_view;
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5);
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev);
d_t_hist = k_t_hist.template view<DeviceType>();
h_t_hist = k_t_hist.h_view;
}
@ -368,7 +369,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
d_o[i] = 0.0;
d_r[i] = 0.0;
d_d[i] = 0.0;
//for( int j = 0; j < 5; j++ )
//for( int j = 0; j < nprev; j++ )
//d_s_hist(i,j) = d_t_hist(i,j) = 0.0;
}
@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos<DeviceType>::calculate_q_item(int ii) const
if (mask[i] & groupbit) {
q(i) = d_s[i] - delta * d_t[i];
for (int k = 4; k > 0; --k) {
for (int k = nprev-1; k > 0; --k) {
d_s_hist(i,k) = d_s_hist(i,k-1);
d_t_hist(i,k) = d_t_hist(i,k-1);
}
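Generalizing the hard-coded 5 to nprev, as this hunk does, keeps the O(nprev) per-atom shift each step. A constant-time alternative (a hypothetical sketch, not what the code does) would keep a rotating head index instead of moving the entries:

// Hypothetical ring-buffer variant of the nprev-deep charge history:
// push() advances one index instead of shifting nprev-1 entries.
struct ChargeHistory {
  static const int nprev = 4;
  double hist[nprev];
  int head;
  ChargeHistory() : head(0) { for (int k = 0; k < nprev; k++) hist[k] = 0.0; }
  void push(double s) { head = (head + 1) % nprev; hist[head] = s; }
  double back(int k) const {          // k steps into the past, 0 <= k < nprev
    return hist[(head - k + nprev) % nprev];
  }
};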
@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
{
double bytes;
bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist
bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist
bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage
bytes += n_cap*2 * sizeof(int); // matrix...
bytes += m_cap * sizeof(int);

View File

@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
neighflag_qeq_set = 0;
exchange_comm_classic = 0;
forward_comm_classic = 0;
reverse_comm_classic = 0;
exchange_comm_on_host = 0;
forward_comm_on_host = 0;
reverse_comm_on_host = 0;
#ifdef KILL_KOKKOS_ON_SIGSEGV
signal(SIGSEGV, my_signal_handler);
@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
neighflag_qeq_set = 0;
int newtonflag = 0;
double binsize = 0.0;
exchange_comm_classic = forward_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = 0;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
int iarg = 0;
while (iarg < narg) {
@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
} else if (strcmp(arg[iarg],"comm") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
if (strcmp(arg[iarg+1],"no") == 0) {
exchange_comm_classic = forward_comm_classic = 1;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
} else if (strcmp(arg[iarg+1],"host") == 0) {
exchange_comm_classic = forward_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = 1;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
} else if (strcmp(arg[iarg+1],"device") == 0) {
exchange_comm_classic = forward_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = 0;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
} else error->all(FLERR,"Illegal package kokkos command");
iarg += 2;
} else if (strcmp(arg[iarg],"comm/exchange") == 0) {
@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
forward_comm_on_host = 0;
} else error->all(FLERR,"Illegal package kokkos command");
iarg += 2;
} else if (strcmp(arg[iarg],"comm/reverse") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1;
else if (strcmp(arg[iarg+1],"host") == 0) {
reverse_comm_classic = 0;
reverse_comm_on_host = 1;
} else if (strcmp(arg[iarg+1],"device") == 0) {
reverse_comm_classic = 0;
reverse_comm_on_host = 0;
} else error->all(FLERR,"Illegal package kokkos command");
iarg += 2;
} else error->all(FLERR,"Illegal package kokkos command");
}
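With the parsing above in place, the reverse path is steered from the input script the same way as the exchange and forward paths, e.g. (keyword values exactly as accepted by the code above):

package kokkos comm/reverse device   # pack/unpack reverse buffers in device kernels
package kokkos comm/reverse host     # run the reverse pack/unpack on the host
package kokkos comm/reverse no       # fall back to classic CommBrick reverse comm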

View File

@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers {
int neighflag_qeq_set;
int exchange_comm_classic;
int forward_comm_classic;
int reverse_comm_classic;
int exchange_comm_on_host;
int forward_comm_on_host;
int reverse_comm_on_host;
int num_threads,ngpu;
int numa;
int auto_sync;

View File

@ -75,6 +75,10 @@ void NBinKokkos<DeviceType>::bin_atoms_setup(int nall)
k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins);
bincount = k_bincount.view<DeviceType>();
}
if (nall > k_atom2bin.d_view.dimension_0()) {
k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall);
atom2bin = k_atom2bin.view<DeviceType>();
}
}
/* ----------------------------------------------------------------------
@ -86,6 +90,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
{
last_bin = update->ntimestep;
k_bins.template sync<DeviceType>();
k_bincount.template sync<DeviceType>();
k_atom2bin.template sync<DeviceType>();
h_resize() = 1;
while(h_resize() > 0) {
@ -115,6 +123,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
c_bins = bins;
}
}
k_bins.template modify<DeviceType>();
k_bincount.template modify<DeviceType>();
k_atom2bin.template modify<DeviceType>();
}
/* ---------------------------------------------------------------------- */
@ -125,6 +137,7 @@ void NBinKokkos<DeviceType>::binatomsItem(const int &i) const
{
const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2));
atom2bin(i) = ibin;
const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1);
if(ac < bins.dimension_1()) {
bins(ibin, ac) = i;

View File

@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard {
int atoms_per_bin;
DAT::tdual_int_1d k_bincount;
DAT::tdual_int_2d k_bins;
DAT::tdual_int_1d k_atom2bin;
typename AT::t_int_1d bincount;
const typename AT::t_int_1d_const c_bincount;
typename AT::t_int_2d bins;
typename AT::t_int_2d_const c_bins;
typename AT::t_int_1d atom2bin;
typename AT::t_int_scalar d_resize;
typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
typename AT::t_x_array_randomread x;

View File

@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag)
// build pairwise lists for all perpetual NPair/NeighList
// grow() with nlocal/nall args so that only realloc if have to
atomKK->sync(Host,ALL_MASK);
for (i = 0; i < npair_perpetual; i++) {
m = plist[i];
if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
neigh_pair[m]->build_setup();
neigh_pair[m]->build(lists[m]);

View File

@ -73,6 +73,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
atoms_per_bin = nbKK->atoms_per_bin;
k_bincount = nbKK->k_bincount;
k_bins = nbKK->k_bins;
k_atom2bin = nbKK->k_atom2bin;
}
/* ----------------------------------------------------------------------
@ -88,12 +89,14 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
int maxstencil = ns->get_maxstencil();
if (maxstencil > k_stencil.dimension_0())
k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
for (int k = 0; k < maxstencil; k++)
k_stencil.h_view(k) = ns->stencil[k];
k_stencil.modify<LMPHostType>();
k_stencil.sync<DeviceType>();
if (GHOST) {
if (maxstencil > k_stencilxyz.dimension_0())
k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
for (int k = 0; k < maxstencil; k++) {
k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
@ -122,6 +125,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
k_cutneighsq.view<DeviceType>(),
k_bincount.view<DeviceType>(),
k_bins.view<DeviceType>(),
k_atom2bin.view<DeviceType>(),
nstencil,
k_stencil.view<DeviceType>(),
k_stencilxyz.view<DeviceType>(),
@ -164,8 +168,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
k_ex_mol_group.sync<DeviceType>();
k_ex_mol_bit.sync<DeviceType>();
k_ex_mol_intra.sync<DeviceType>();
k_bincount.sync<DeviceType>(),
k_bins.sync<DeviceType>(),
k_bincount.sync<DeviceType>();
k_bins.sync<DeviceType>();
k_atom2bin.sync<DeviceType>();
atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
data.special_flag[0] = special_flag[0];
@ -317,7 +322,7 @@ void NeighborKokkosExecute<DeviceType>::
const X_FLOAT ztmp = x(i, 2);
const int itype = type(i);
const int ibin = coord2bin(xtmp, ytmp, ztmp);
const int ibin = c_atom2bin(i);
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
= d_stencil;
@ -431,7 +436,7 @@ void NeighborKokkosExecute<DeviceType>::
if(n > neigh_list.maxneighs) {
resize() = 1;
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
}
neigh_list.d_ilist(i) = i;
@ -641,7 +646,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
if(n > neigh_list.maxneighs) {
resize() = 1;
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
}
}
}
@ -678,7 +683,7 @@ void NeighborKokkosExecute<DeviceType>::
// no molecular test when i = ghost atom
if (i < nlocal) {
const int ibin = coord2bin(xtmp, ytmp, ztmp);
const int ibin = c_atom2bin(i);
for (int k = 0; k < nstencil; k++) {
const int jbin = ibin + stencil[k];
for(int m = 0; m < c_bincount(jbin); m++) {
@ -764,7 +769,7 @@ void NeighborKokkosExecute<DeviceType>::
if(n > neigh_list.maxneighs) {
resize() = 1;
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
}
neigh_list.d_ilist(i) = i;
}

View File

@ -105,6 +105,7 @@ class NPairKokkos : public NPair {
int atoms_per_bin;
DAT::tdual_int_1d k_bincount;
DAT::tdual_int_2d k_bins;
DAT::tdual_int_1d k_atom2bin;
// data from NStencil class
@ -148,6 +149,8 @@ class NeighborKokkosExecute
const typename AT::t_int_1d_const c_bincount;
typename AT::t_int_2d bins;
typename AT::t_int_2d_const c_bins;
const typename AT::t_int_1d atom2bin;
const typename AT::t_int_1d_const c_atom2bin;
// data from NStencil class
@ -190,6 +193,7 @@ class NeighborKokkosExecute
const typename AT::t_xfloat_2d_randomread &_cutneighsq,
const typename AT::t_int_1d &_bincount,
const typename AT::t_int_2d &_bins,
const typename AT::t_int_1d &_atom2bin,
const int _nstencil,
const typename AT::t_int_1d &_d_stencil,
const typename AT::t_int_1d_3 &_d_stencilxyz,
@ -224,6 +228,7 @@ class NeighborKokkosExecute
const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
neigh_list(_neigh_list), cutneighsq(_cutneighsq),
bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
atom2bin(_atom2bin),c_atom2bin(_atom2bin),
nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
nlocal(_nlocal),
x(_x),type(_type),mask(_mask),molecule(_molecule),
@ -281,38 +286,6 @@ class NeighborKokkosExecute
void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
#endif
KOKKOS_INLINE_FUNCTION
int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
{
int ix,iy,iz;
if (x >= bboxhi[0])
ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
else if (x >= bboxlo[0]) {
ix = static_cast<int> ((x-bboxlo[0])*bininvx);
ix = MIN(ix,nbinx-1);
} else
ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
if (y >= bboxhi[1])
iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
else if (y >= bboxlo[1]) {
iy = static_cast<int> ((y-bboxlo[1])*bininvy);
iy = MIN(iy,nbiny-1);
} else
iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
if (z >= bboxhi[2])
iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
else if (z >= bboxlo[2]) {
iz = static_cast<int> ((z-bboxlo[2])*bininvz);
iz = MIN(iz,nbinz-1);
} else
iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
}
KOKKOS_INLINE_FUNCTION
int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
{

View File

@ -131,6 +131,8 @@ template<class DeviceType>
void PairReaxCKokkos<DeviceType>::init_style()
{
PairReaxC::init_style();
if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version
fix_reax = NULL;
// irequest = neigh request made by parent class
@ -555,8 +557,8 @@ void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()
ntypes = atom->ntypes;
for( i = 0; i < ntypes; ++i ) {
for( j = i; j < ntypes; ++j )
for( i = 0; i <= ntypes; ++i ) {
for( j = i; j <= ntypes; ++j )
if( LR[i][j].n ) {
sfree( LR[i][j].y, "LR[i,j].y" );
sfree( LR[i][j].H, "LR[i,j].H" );

View File

@ -294,6 +294,7 @@ void VerletKokkos::run(int n)
int n_pre_exchange = modify->n_pre_exchange;
int n_pre_neighbor = modify->n_pre_neighbor;
int n_pre_force = modify->n_pre_force;
int n_pre_reverse = modify->n_pre_reverse;
int n_post_force = modify->n_post_force;
int n_end_of_step = modify->n_end_of_step;
@ -304,9 +305,9 @@ void VerletKokkos::run(int n)
f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0());
static double time = 0.0;
atomKK->sync(Device,ALL_MASK);
Kokkos::Impl::Timer ktimer;
//static double time = 0.0;
//Kokkos::Impl::Timer ktimer;
timer->init_timeout();
for (int i = 0; i < n; i++) {
@ -320,10 +321,10 @@ void VerletKokkos::run(int n)
// initial time integration
ktimer.reset();
//ktimer.reset();
timer->stamp();
modify->initial_integrate(vflag);
time += ktimer.seconds();
//time += ktimer.seconds();
if (n_post_integrate) modify->post_integrate();
timer->stamp(Timer::MODIFY);
@ -523,11 +524,18 @@ void VerletKokkos::run(int n)
atomKK->k_f.modify<LMPDeviceType>();
}
if (n_pre_reverse) {
modify->pre_reverse(eflag,vflag);
timer->stamp(Timer::MODIFY);
}
// reverse communication of forces
if (force->newton) comm->reverse_comm();
if (force->newton) {
Kokkos::fence();
comm->reverse_comm();
timer->stamp(Timer::COMM);
}
// force modifications, final time integration, diagnostics

View File

@ -15,13 +15,14 @@ SHELL = /bin/sh
CC = CC
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
$(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = CC
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB =
SIZE = size

View File

@ -10,7 +10,7 @@ CC = mpiicpc
MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
-qoverride-limits $(MIC_OPT)
-qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
SHFLAGS = -fPIC
DEPFLAGS = -M

src/MAKE/OPTIONS/Makefile.intel_cpu Executable file → Normal file
View File

@ -8,14 +8,14 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc -ltbbmalloc_proxy
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar

View File

@ -8,8 +8,8 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M

View File

@ -8,14 +8,14 @@ SHELL = /bin/sh
CC = mpicxx -cxx=icc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpicxx -cxx=icc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LIB =
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar

View File

@ -9,14 +9,14 @@ SHELL = /bin/sh
export OMPI_CXX = icc
CC = mpicxx
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpicxx
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc -ltbbmalloc_proxy
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar

View File

@ -1,123 +0,0 @@
# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
SHELL = /bin/sh
# ---------------------------------------------------------------------
# compiler/linker settings
# specify flags and libraries needed for your compiler
CC = mpiicpc
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-xHost -fno-alias -ansi-alias -restrict \
-qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar
ARFLAGS = -rc
SHLIBFLAGS = -shared
# ---------------------------------------------------------------------
# LAMMPS-specific settings, all OPTIONAL
# specify settings for LAMMPS features you will use
# if you change any -D setting, do full re-compile after "make clean"
# LAMMPS ifdef settings
# see possible settings in Section 2.2 (step 4) of manual
LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG
# MPI library
# see discussion in Section 2.2 (step 5) of manual
# MPI wrapper compiler/linker can provide this info
# can point to dummy MPI library in src/STUBS as in Makefile.serial
# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
# INC = path for mpi.h, MPI compiler settings
# PATH = path for MPI library
# LIB = name of MPI library
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
MPI_PATH =
MPI_LIB =
# FFT library
# see discussion in Section 2.2 (step 6) of manual
# can be left blank to use provided KISS FFT library
# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
# PATH = path for FFT library
# LIB = name of FFT library
FFT_INC = -DFFT_MKL -DFFT_SINGLE
FFT_PATH =
FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
# JPEG and/or PNG library
# see discussion in Section 2.2 (step 7) of manual
# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
# INC = path(s) for jpeglib.h and/or png.h
# PATH = path(s) for JPEG library and/or PNG library
# LIB = name(s) of JPEG library and/or PNG library
JPG_INC =
JPG_PATH =
JPG_LIB = -ljpeg
# ---------------------------------------------------------------------
# build rules and dependencies
# do not edit this section
include Makefile.package.settings
include Makefile.package
EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
# Path to src files
vpath %.cpp ..
vpath %.h ..
# Link target
$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
# Library targets
lib: $(OBJ) $(EXTRA_LINK_DEPENDS)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
shlib: $(OBJ) $(EXTRA_LINK_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
$(OBJ) $(EXTRA_LIB) $(LIB)
# Compilation rules
%.o:%.cpp $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
%.o:%.cu $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
# Individual dependencies
depend : fastdep.exe $(SRC)
@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
fastdep.exe: ../DEPEND/fastdep.c
cc -O -o $@ $<
sinclude .depend

View File

@ -8,13 +8,13 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size

View File

@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg)
time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
bigint clock = 0;
timer->init();
timer->barrier_start();
time_start = timer->get_wall(Timer::TOTAL);

View File

@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg)
nbuild = ndanger = 0;
time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;
timer->init();
timer->barrier_start();
time_start = timer->get_wall(Timer::TOTAL);

View File

@ -46,7 +46,7 @@ action nbin_intel.h
action nbin_intel.cpp
action npair_intel.h
action npair_intel.cpp
action intel_simd.h pair_sw_intel.cpp
action intel_simd.h
action intel_intrinsics.h pair_tersoff_intel.cpp
action intel_intrinsics_airebo.h pair_airebo_intel.cpp

View File

@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version:
2017 update 2 - No changes needed
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
2018 initial release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
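For illustration, a minimal sketch of the corresponding Makefile edit for a
2018u1 or newer compiler (hypothetical; the exact -x switch depends on the
target CPU):
OPTFLAGS = -xCORE-AVX512 -qopt-zmm-usage=high -O2 -fp-model fast=2 \
           -no-prec-div -qoverride-limits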
-----------------------------------------------------------------------------
When using the suffix command with "intel", intel styles will be used if they
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
allow for running most styles in LAMMPS with threading.
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
available. This allows running most styles in LAMMPS with threading.
-----------------------------------------------------------------------------
The Long-Range Thread mode (LRT) in the Intel package currently uses
pthreads by default. If pthreads are not supported in the build environment,
the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for
builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
build with compilers that support threads using the C++11 standard. When using
The Long-Range Thread mode (LRT) in the Intel package is enabled through the
-DLMP_INTEL_USELRT define at compile time. All Intel-optimized makefiles
include this define. This feature uses pthreads by default.
Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
support threads intrinsically using the C++11 standard. When using
LRT mode, you might need to disable OpenMP affinity settings (e.g.
export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
need to be changed.
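As a sketch, a build environment without pthreads but with a C++11-capable
compiler could swap in the LRT11 define (hypothetical variant of the existing
Intel makefiles):
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
          -DLMP_INTEL_USELRT -DLMP_INTEL_LRT11 $(OPTFLAGS)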
-----------------------------------------------------------------------------
Unless the Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
should be added to the compile flags. This enables the MKL Mersenne Twister
random number generator (RNG) for Dissipative Particle Dynamics (DPD). This
RNG can give significantly faster performance, and it also has a significantly
longer period than the standard RNG for DPD.
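A minimal sketch of enabling this, assuming MKLROOT is set and reusing the MKL
link line the makefiles already use for FFTs:
CCFLAGS += -DLMP_USE_MKL_RNG
LIB     += -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core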
-----------------------------------------------------------------------------
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
Intel compilers.
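A hypothetical sketch, following the pattern of the offload makefiles shipped
with the package (requires an Intel compiler with offload support):
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -xHost -fno-alias -ansi-alias \
          -restrict -qoverride-limits $(MIC_OPT)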

View File

@ -9,6 +9,7 @@
# in.intel.tersoff - Silicon benchmark with Tersoff
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
# in.intel.airebo - Polyethylene benchmark with AIREBO
# in.intel.dpd - Dissipative Particle Dynamics
#
#############################################################################
@ -16,16 +17,17 @@
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
# Xeon E5-2697v4 Xeon Phi 7250
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
#
# in.intel.lj - 199.5 282.3
# in.intel.rhodo - 12.4 17.5
# in.intel.lc - 19.0 25.7
# in.intel.eam - 59.4 92.8
# in.intel.sw - 132.4 161.9
# in.intel.tersoff - 83.3 101.1
# in.intel.water - 53.4 90.3
# in.intel.airebo - 7.3 11.8
# in.intel.lj - 199.5 282.3 317.3
# in.intel.rhodo - 12.4 17.5 24.4
# in.intel.lc - 19.0 25.7 26.8
# in.intel.eam - 59.4 92.8 105.6
# in.intel.sw - 132.4 161.9 213.8
# in.intel.tersoff - 83.3 101.1 109.6
# in.intel.water - 53.4 90.3 105.5
# in.intel.airebo - 7.3 11.8 17.6
# in.intel.dpd - 74.5 100.4 148.1
#
#############################################################################

View File

@ -0,0 +1,48 @@
# DPD benchmark
variable N index on # Newton Setting
variable w index 10 # Warmup Timesteps
variable t index 4000 # Main Run Timesteps
variable m index 1 # Main Run Timestep Multiplier
variable n index 0 # Use NUMA Mapping for Multi-Node
variable p index 0 # Use Power Measurement
variable x index 4
variable y index 2
variable z index 2
variable xx equal 20*$x
variable yy equal 20*$y
variable zz equal 20*$z
variable rr equal floor($t*$m)
newton $N
if "$n > 0" then "processors * * * grid numa"
units lj
atom_style atomic
comm_modify mode single vel yes
lattice fcc 3.0
region box block 0 ${xx} 0 ${yy} 0 ${zz}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 1.0 87287 loop geom
pair_style dpd 1.0 1.0 928948
pair_coeff 1 1 25.0 4.5
neighbor 0.5 bin
neigh_modify delay 0 every 1
fix 1 all nve
timestep 0.04
thermo 1000
if "$p > 0" then "run_style verlet/power"
if "$w > 0" then "run $w"
run ${rr}

View File

@ -0,0 +1,441 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <math.h>
#include "dihedral_fourier_intel.h"
#include "atom.h"
#include "comm.h"
#include "memory.h"
#include "neighbor.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "error.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define PTOLERANCE (flt_t)1.05
#define MTOLERANCE (flt_t)-1.05
typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */
DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
: DihedralFourier(lmp)
{
suffix_flag |= Suffix::INTEL;
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::compute(int eflag, int vflag)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_use_base) {
DihedralFourier::compute(eflag, vflag);
return;
}
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = 0;
if (evflag) {
if (vflag && !eflag) {
if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc);
else
eval<0,1,0>(vflag, buffers, fc);
} else {
if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc);
else
eval<1,1,0>(vflag, buffers, fc);
}
} else {
if (force->newton_bond)
eval<0,0,1>(vflag, buffers, fc);
else
eval<0,0,0>(vflag, buffers, fc);
}
}
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralFourierIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
const int inum = neighbor->ndihedrallist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int nfrom, npl, nto, tid;
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
#else
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
#endif
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
if (EFLAG) sedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
// 1st bond
const flt_t vb1x = x[i1].x - x[i2].x;
const flt_t vb1y = x[i1].y - x[i2].y;
const flt_t vb1z = x[i1].z - x[i2].z;
// 2nd bond
const flt_t vb2xm = x[i2].x - x[i3].x;
const flt_t vb2ym = x[i2].y - x[i3].y;
const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z;
// c,s calculation
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
const flt_t rasq = ax*ax + ay*ay + az*az;
const flt_t rbsq = bx*bx + by*by + bz*bz;
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
const flt_t rg = sqrt(rgsq);
flt_t rginv, ra2inv, rb2inv;
rginv = ra2inv = rb2inv = (flt_t)0.0;
if (rg > 0) rginv = (flt_t)1.0/rg;
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
const flt_t rabinv = sqrt(ra2inv*rb2inv);
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
// error check
#ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me;
if (screen) {
char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z);
}
}
#endif
if (c > (flt_t)1.0) c = (flt_t)1.0;
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
flt_t deng;
flt_t df = (flt_t)0.0;
if (EFLAG) deng = (flt_t)0.0;
for (int j = 0; j < nterms[type]; j++) {
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
const flt_t tk = fc.bp[j][type].k;
const int m = fc.bp[j][type].multiplicity;
flt_t p = (flt_t)1.0;
flt_t ddf1, df1;
ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s;
df1 = p*s + df1*c;
p = ddf1;
}
p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m;
p += (flt_t)1.0;
if (m == 0) {
p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0;
}
if (EFLAG) deng += tk * p;
df -= tk * df1;
}
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
const flt_t fga = fg*ra2inv*rginv;
const flt_t hgb = hg*rb2inv*rginv;
const flt_t gaa = -ra2inv*rg;
const flt_t gbb = rb2inv*rg;
const flt_t dtfx = gaa*ax;
const flt_t dtfy = gaa*ay;
const flt_t dtfz = gaa*az;
const flt_t dtgx = fga*ax - hgb*bx;
const flt_t dtgy = fga*ay - hgb*by;
const flt_t dtgz = fga*az - hgb*bz;
const flt_t dthx = gbb*bx;
const flt_t dthy = gbb*by;
const flt_t dthz = gbb*bz;
const flt_t sx2 = df*dtgx;
const flt_t sy2 = df*dtgy;
const flt_t sz2 = df*dtgz;
flt_t f1x = df*dtfx;
flt_t f1y = df*dtfy;
flt_t f1z = df*dtfz;
const flt_t f2x = sx2 - f1x;
const flt_t f2y = sy2 - f1y;
const flt_t f2z = sz2 - f1z;
flt_t f4x = df*dthx;
flt_t f4y = df*dthy;
flt_t f4z = df*dthz;
const flt_t f3x = -sx2 - f4x;
const flt_t f3y = -sy2 - f4y;
const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5);
#else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5);
#endif
}
#ifdef LMP_INTEL_USE_SIMDOFF
#pragma simdoff
#endif
{
if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x;
f[i1].y += f1y;
f[i1].z += f1z;
}
if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x;
f[i2].y += f2y;
f[i2].z += f2z;
}
if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x;
f[i3].y += f3y;
f[i3].z += f3z;
}
if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x;
f[i4].y += f4y;
f[i4].z += f4z;
}
}
} // for n
#ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5;
}
#endif
} // omp parallel
if (EFLAG) energy += oedihedral;
if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
}
fix->set_reduce_flag();
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::init_style()
{
DihedralFourier::init_style();
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
_use_base = 0;
if (fix->offload_balance() != 0.0) {
_use_base = 1;
return;
}
#endif
fix->bond_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1, setflag, nterms, memory);
for (int i = 1; i < bp1; i++) {
if (setflag[i]) {
for (int j = 0; j < nterms[i]; j++) {
fc.bp[j][i].cos_shift = cos_shift[i][j];
fc.bp[j][i].sin_shift = sin_shift[i][j];
fc.bp[j][i].k = k[i][j];
fc.bp[j][i].multiplicity = multiplicity[i][j];
}
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
int *setflag,
int *nterms,
Memory *memory) {
if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0)
_memory->destroy(bp);
if (nbondtypes > 0) {
_maxnterms = 1;
for (int i = 1; i <= nbondtypes; i++)
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
_memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
}
}
_nbondtypes = nbondtypes;
_memory = memory;
}

View File

@ -0,0 +1,82 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef DIHEDRAL_CLASS
DihedralStyle(fourier/intel,DihedralFourierIntel)
#else
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
#define LMP_DIHEDRAL_FOURIER_INTEL_H
#include "dihedral_fourier.h"
#include "fix_intel.h"
namespace LAMMPS_NS {
class DihedralFourierIntel : public DihedralFourier {
public:
DihedralFourierIntel(class LAMMPS *lmp);
virtual void compute(int, int);
void init_style();
private:
FixIntel *fix;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed1;
fc_packed1 **bp;
ForceConst() : _nbondtypes(0) {}
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
Memory *memory);
private:
int _nbondtypes, _maxnterms;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif

View File

@ -285,6 +285,7 @@ int FixIntel::setmask()
{
int mask = 0;
mask |= PRE_REVERSE;
mask |= MIN_PRE_REVERSE;
#ifdef _LMP_INTEL_OFFLOAD
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;

View File

@ -43,6 +43,7 @@ class FixIntel : public Fix {
virtual int setmask();
virtual void init();
virtual void setup(int);
inline void min_setup(int in) { setup(in); }
void setup_pre_reverse(int eflag = 0, int vflag = 0);
void pair_init_check(const bool cdmessage=false);
@ -50,6 +51,8 @@ class FixIntel : public Fix {
void kspace_init_check();
void pre_reverse(int eflag = 0, int vflag = 0);
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
{ pre_reverse(eflag, vflag); }
// Get all forces, calculation results from coprocesser
void sync_coprocessor();

View File

@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
#endif
memset(_ccachei, 0, vsize * sizeof(int));
memset(_ccachej, 0, vsize * sizeof(int));
#ifdef _LMP_INTEL_OFFLOAD
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
#pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachej:length(vsize) alloc_if(1) free_if(0))
}
#ifdef LMP_USE_AVXCD

View File

@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
ito = inum; \
}
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
nthr, vecsize) \
{ \
tid = 0; \
ifrom = 0; \
ip = 1; \
ito = inum; \
}
#endif
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \

View File

@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)

View File

@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++) {

View File

@ -0,0 +1,617 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_dpd_intel.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define LMP_MKL_RNG VSL_BRNG_MT19937
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define IEPSILON 1.0e10
/* ---------------------------------------------------------------------- */
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
PairDPD(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
random_thread = NULL;
_nrandom_thread = 0;
}
/* ---------------------------------------------------------------------- */
PairDPDIntel::~PairDPDIntel()
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
#endif
delete []random_thread;
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::compute(int eflag, int vflag)
{
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairDPDIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (_onetype) {
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (eflag) {
if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
}
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairDPDIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
typedef struct { double x, y, z; } lmp_vt;
lmp_vt *v = (lmp_vt *)atom->v[0];
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
const int rng_size = buffers->get_max_nbors();
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iip, iito, tid;
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
#ifdef LMP_USE_MKL_RNG
VSLStreamStatePtr *my_random = &(random_thread[tid]);
#else
RanMars *my_random = random_thread[tid];
#endif
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
int rngi = rngi_thread[tid];
int foff;
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
else foff = -minlocal;
FORCE_T * _noalias const f = f_start + foff;
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
}
for (int i = iifrom; i < iito; i += iip) {
int itype, ptr_off;
const FC_PACKED1_T * _noalias parami;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const flt_t vxtmp = v[i].x;
const flt_t vytmp = v[i].y;
const flt_t vztmp = v[i].z;
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (rngi + jnum > rng_size) {
#ifdef LMP_USE_MKL_RNG
if (sizeof(flt_t) == sizeof(float))
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
else
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(double*)my_rand_buffer, 0.0, 1.0 );
#else
for (int jj = 0; jj < rngi; jj++)
my_rand_buffer[jj] = my_random->gaussian();
#endif
rngi = 0;
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
icut = parami[jtype].icut;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
if (rinv > icut) {
flt_t factor_dpd;
if (!ONETYPE) factor_dpd = special_lj[sbindex];
flt_t delvx = vxtmp - v[j].x;
flt_t delvy = vytmp - v[j].y;
flt_t delvz = vztmp - v[j].z;
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
flt_t randnum = my_rand_buffer[jj];
flt_t iwd = rinv - icut;
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
if (!ONETYPE) {
a0 = parami[jtype].a0;
gamma = parami[jtype].gamma;
sigma = parami[jtype].sigma;
}
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
if (!ONETYPE) fpair *= factor_dpd;
fpair *= iwd;
const flt_t fpx = fpair * delx;
fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely;
fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz;
fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) {
flt_t cut = (flt_t)1.0/icut;
flt_t r = (flt_t)1.0/rinv;
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
if (!ONETYPE) evdwl *= factor_dpd;
sevdwl += evdwl;
if (eatom) {
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl;
}
}
if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
} // if rsq
} // for jj
if (NEWTON_PAIR) {
f[i].x += fxtmp;
f[i].y += fytmp;
f[i].z += fztmp;
} else {
f[i].x = fxtmp;
f[i].y = fytmp;
f[i].z = fztmp;
}
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
rngi += jnum;
} // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5);
rngi_thread[tid] = rngi;
} // end omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EFLAG || vflag)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairDPDIntel::settings(int narg, char **arg) {
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
delete []random_thread;
#endif
PairDPD::settings(narg,arg);
_nrandom_thread = comm->nthreads;
#ifdef LMP_USE_MKL_RNG
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#else
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#endif
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::init_style()
{
PairDPD::init_style();
if (force->newton_pair == 0) {
neighbor->requests[neighbor->nrequest-1]->half = 0;
neighbor->requests[neighbor->nrequest-1]->full = 1;
}
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_balance() != 0.0)
error->all(FLERR,
"Offload for dpd/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.param[i][j].a0 = a0[i][j];
fc.param[i][j].gamma = gamma[i][j];
fc.param[i][j].sigma = sigma[i][j];
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nthreads,
const int max_nbors,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
_memory->destroy(param);
_memory->destroy(rand_buffer_thread);
_memory->destroy(rngi);
}
if (ntypes > 0) {
_cop = cop;
memory->create(param,ntypes,ntypes,"fc.param");
memory->create(rand_buffer_thread, nthreads, max_nbors,
"fc.rand_buffer_thread");
memory->create(rngi,nthreads,"fc.param");
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
}
}
_ntypes = ntypes;
_memory = memory;
}
/* ----------------------------------------------------------------------
proc 0 reads from restart file, bcasts
------------------------------------------------------------------------- */
void PairDPDIntel::read_restart_settings(FILE *fp)
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
delete []random_thread;
#endif
PairDPD::read_restart_settings(fp);
_nrandom_thread = comm->nthreads;
#ifdef LMP_USE_MKL_RNG
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#else
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#endif
}

View File

@ -0,0 +1,110 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(dpd/intel,PairDPDIntel)
#else
#ifndef LMP_PAIR_DPD_INTEL_H
#define LMP_PAIR_DPD_INTEL_H
#include "pair_dpd.h"
#include "fix_intel.h"
#ifdef LMP_USE_MKL_RNG
#include "mkl_vsl.h"
#else
#include "random_mars.h"
#endif
namespace LAMMPS_NS {
class PairDPDIntel : public PairDPD {
public:
PairDPDIntel(class LAMMPS *);
~PairDPDIntel();
virtual void compute(int, int);
void settings(int, char **);
void init_style();
void read_restart_settings(FILE *);
private:
FixIntel *fix;
int _cop, _onetype, _nrandom_thread;
#ifdef LMP_USE_MKL_RNG
VSLStreamStatePtr *random_thread;
#else
RanMars **random_thread;
#endif
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
// ----------------------------------------------------------------------
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
_alignvar(flt_t special_lj[4],64);
fc_packed1 **param;
flt_t **rand_buffer_thread;
int *rngi;
ForceConst() : _ntypes(0) {}
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
Memory *memory, const int cop);
private:
int _ntypes, _cop;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self-explanatory.
*/

View File

@ -68,7 +68,7 @@ void VerletLRTIntel::init()
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
#ifdef LMP_INTEL_NOLRT
#ifndef LMP_INTEL_USELRT
error->all(FLERR,
"LRT otion for Intel package disabled at compile time");
#endif

View File

@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
#include "verlet.h"
#include "pppm_intel.h"
#ifndef LMP_INTEL_USELRT
#define LMP_INTEL_NOLRT
#else
#ifdef LMP_INTEL_USELRT
#ifdef LMP_INTEL_LRT11
#define _LMP_INTEL_LRT_11
#include <thread>
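
Both hunks above replace the derived negative macro (LMP_INTEL_NOLRT) with a direct test of the positive feature flag, so the header and the error check in init() key off the same symbol, LMP_INTEL_USELRT. A tiny illustration of why a single positive flag is preferable (hypothetical flag name; compile with and without -DUSE_LRT to flip the branch):

#include <cstdio>

int main() {
#ifdef USE_LRT
  std::printf("LRT support compiled in\n");
#else
  // with one positive flag there is no second, derived macro
  // that can drift out of sync with the first
  std::printf("LRT support disabled at compile time\n");
#endif
  return 0;
}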

View File

@ -367,7 +367,7 @@ void manifold_gaussian_bump::test_lut()
}else{
taper_z = 0.0;
}
fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
gg, nn[0], nn[1], nn[2] );
}
fclose(fp);
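
The one-line fix above brings the format string (previously five %g conversions) in line with the seven arguments actually passed to fprintf; such mismatches are undefined behavior. On GCC and Clang this class of bug can be caught at compile time by annotating printf-style wrappers; a sketch with a hypothetical logging function:

#include <cstdarg>
#include <cstdio>

// the format attribute (GCC/Clang-specific) tells the compiler to check
// the variadic arguments against the format string, so a five-conversions-
// versus-seven-arguments mismatch would trigger a -Wformat warning here
__attribute__((format(printf, 1, 2)))
void log_row(const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  std::vfprintf(stderr, fmt, ap);
  va_end(ap);
}

int main() {
  double x = 1.25, g = 0.5, n0 = 0.0, n1 = 0.0, n2 = 1.0;
  log_row("%g %g %g %g %g\n", x, g, n0, n1, n2);   // counts match: fine
  // log_row("%g %g %g\n", x, g, n0, n1, n2);      // -Wformat would warn
  return 0;
}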

View File

@ -98,7 +98,7 @@ int FixSRP::setmask()
void FixSRP::init()
{
if (force->pair_match("hybrid",1) == NULL)
if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL)
error->all(FLERR,"Cannot use pair srp without pair_style hybrid");
int has_rigid = 0;

View File

@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
if (multiproc)
error->all(FLERR,"Multi-processor writes are not supported.");
if (multifile)
error->all(FLERR,"Multiple files are not supported.");
if (append_flag && multifile)
error->all(FLERR,"Cannot append when writing to multiple files.");
perat = new nc_perat_t[nfield];
@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF()
void DumpNetCDF::openfile()
{
char *filecurrent = filename;
if (multifile && !singlefile_opened) {
char *filestar = filecurrent;
filecurrent = new char[strlen(filestar) + 16];
char *ptr = strchr(filestar,'*');
*ptr = '\0';
if (padflag == 0)
sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
filestar,update->ntimestep,ptr+1);
else {
char bif[8],pad[16];
strcpy(bif,BIGINT_FORMAT);
sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
}
*ptr = '*';
}
if (thermo && !singlefile_opened) {
if (thermovar) delete [] thermovar;
thermovar = new int[output->thermo->nfield];
@ -268,14 +286,14 @@ void DumpNetCDF::openfile()
ntotalgr = group->count(igroup);
if (filewriter) {
if (append_flag && access(filename, F_OK) != -1) {
if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
// FIXME: check that dimensions and variables conform with the
// data structure standard.
if (singlefile_opened) return;
singlefile_opened = 1;
NCERRX( nc_open(filename, NC_WRITE, &ncid), filename );
NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent );
// dimensions
NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@ -348,8 +366,8 @@ void DumpNetCDF::openfile()
if (singlefile_opened) return;
singlefile_opened = 1;
NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid),
filename );
NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid),
filecurrent );
// dimensions
NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@ -598,15 +616,39 @@ void DumpNetCDF::closefile()
if (filewriter && singlefile_opened) {
NCERR( nc_close(ncid) );
singlefile_opened = 0;
// write to next frame upon next open
if (multifile)
framei = 1;
else {
// append next time DumpNetCDF::openfile is called
append_flag = 1;
// write to next frame upon next open
framei++;
}
}
}
/* ---------------------------------------------------------------------- */
template <typename T>
int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp)
{
return nc_put_var1_int(ncid, varid, index, tp);
}
template <>
int nc_put_var1_bigint<long>(int ncid, int varid, const size_t index[],
const long* tp)
{
return nc_put_var1_long(ncid, varid, index, tp);
}
template <>
int nc_put_var1_bigint<long long>(int ncid, int varid, const size_t index[],
const long long* tp)
{
return nc_put_var1_longlong(ncid, varid, index, tp);
}
void DumpNetCDF::write()
{
// open file
@ -638,13 +680,8 @@ void DumpNetCDF::write()
th->keyword[i] );
}
else if (th->vtype[i] == BIGINT) {
#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
th->keyword[i] );
#else
NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
th->keyword[i] );
#endif
}
}
}
@ -887,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg)
return 2;
}
else if (strcmp(arg[iarg],"at") == 0) {
if (!append_flag)
error->all(FLERR,"expected 'append yes' before 'at' keyword");
iarg++;
framei = force->inumeric(FLERR,arg[iarg]);
if (framei < 0) framei--;
@ -911,68 +950,6 @@ int DumpNetCDF::modify_param(int narg, char **arg)
/* ---------------------------------------------------------------------- */
void DumpNetCDF::write_prmtop()
{
char fn[1024];
char tmp[81];
FILE *f;
strcpy(fn, filename);
strcat(fn, ".prmtop");
f = fopen(fn, "w");
fprintf(f, "%%VERSION LAMMPS\n");
fprintf(f, "%%FLAG TITLE\n");
fprintf(f, "%%FORMAT(20a4)\n");
memset(tmp, ' ', 76);
tmp[76] = '\0';
fprintf(f, "NASN%s\n", tmp);
fprintf(f, "%%FLAG POINTERS\n");
fprintf(f, "%%FORMAT(10I8)\n");
#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
fprintf(f, "%8li", ntotalgr);
#else
fprintf(f, "%8i", ntotalgr);
#endif
for (int i = 0; i < 11; i++)
fprintf(f, "%8i", 0);
fprintf(f, "\n");
for (int i = 0; i < 12; i++)
fprintf(f, "%8i", 0);
fprintf(f, "\n");
for (int i = 0; i < 6; i++)
fprintf(f, "%8i", 0);
fprintf(f, "\n");
fprintf(f, "%%FLAG ATOM_NAME\n");
fprintf(f, "%%FORMAT(20a4)\n");
for (int i = 0; i < ntotalgr; i++) {
fprintf(f, "%4s", "He");
if ((i+1) % 20 == 0)
fprintf(f, "\n");
}
fprintf(f, "%%FLAG CHARGE\n");
fprintf(f, "%%FORMAT(5E16.5)\n");
for (int i = 0; i < ntotalgr; i++) {
fprintf(f, "%16.5e", 0.0);
if ((i+1) % 5 == 0)
fprintf(f, "\n");
}
fprintf(f, "%%FLAG MASS\n");
fprintf(f, "%%FORMAT(5E16.5)\n");
for (int i = 0; i < ntotalgr; i++) {
fprintf(f, "%16.5e", 1.0);
if ((i+1) % 5 == 0)
fprintf(f, "\n");
}
fclose(f);
}
/* ---------------------------------------------------------------------- */
void DumpNetCDF::ncerr(int err, const char *descr, int line)
{
if (err != NC_NOERR) {
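
The nc_put_var1_bigint template introduced above removes the old #if defined(LAMMPS_SMALLBIG)/#else conditional: instead of the preprocessor selecting a call site, explicit specializations route whatever integer type bigint resolves to onto the matching netCDF entry point. The same dispatch pattern in isolation, with stub functions standing in for nc_put_var1_int/_long/_longlong:

#include <cstdio>

int put_int(int v)         { std::printf("int: %d\n", v); return 0; }
int put_long(long v)       { std::printf("long: %ld\n", v); return 0; }
int put_llong(long long v) { std::printf("long long: %lld\n", v); return 0; }

// primary template covers int; specializations pick the wider backends
template <typename T> int put_bigint(T v)          { return put_int(v); }
template <> int put_bigint<long>(long v)           { return put_long(v); }
template <> int put_bigint<long long>(long long v) { return put_llong(v); }

int main() {
  typedef long long bigint;        // e.g. a LAMMPS_BIGBIG-style build
  const bigint step = 1000000000000LL;
  return put_bigint(step);         // resolves at compile time, no #if needed
}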

View File

@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom {
void closefile();
virtual void write_header(bigint);
virtual void write_data(int, double *);
void write_prmtop();
virtual int modify_param(int, char **);

Some files were not shown because too many files have changed in this diff.