Merge branch 'master' into HEAD
This commit is contained in:
@ -37,6 +37,10 @@ enable_language(CXX)
|
||||
#####################################################################
|
||||
include(CheckCCompilerFlag)
|
||||
|
||||
# Add "-restrict" to the C++ flags when building with the Intel compiler.
# The variable name is passed to if() unexpanded: if() dereferences it
# itself, so the test stays well-formed even when the compiler ID is empty
# (an unquoted ${...} expansion would collapse to `if( STREQUAL "Intel")`).
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
endif()
|
||||
|
||||
########################################################################
|
||||
# User input options #
|
||||
########################################################################
|
||||
@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
|
||||
# LAMMPS_EXCEPTIONS: when ON, compile with -DLAMMPS_EXCEPTIONS and append the
# same define to LAMMPS_API_DEFINES.
option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
if(LAMMPS_EXCEPTIONS)
  add_definitions(-DLAMMPS_EXCEPTIONS)
  # Append exactly once; the variable reference must be closed with "}"
  # before the new define, otherwise the expansion is malformed.
  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
endif()
|
||||
|
||||
# LAMMPS_MACHINE: user-settable cache string holding a name suffix; it defaults
# to empty. The help text below is what CMake front-ends display for the entry.
set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically)")
|
||||
@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
|
||||
############################################
|
||||
# Build the "lammps" library from the collected sources and link it against
# the libraries accumulated in LAMMPS_LINK_LIBS.
add_library(lammps ${LIB_SOURCES})
target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
# add_dependencies() requires at least one dependency after the target name,
# so it may only be called when LAMMPS_DEPS is non-empty.
if(LAMMPS_DEPS)
  add_dependencies(lammps ${LAMMPS_DEPS})
endif()
# The output file name carries the optional LAMMPS_MACHINE suffix.
set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE})
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 19 KiB |
@ -706,7 +706,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding
|
||||
quantum forces calculated by LATTE.
|
||||
|
||||
More information on LATTE can be found at this web site:
|
||||
"https://github.com/lanl/LATTE"_#latte_home. A brief technical
|
||||
"https://github.com/lanl/LATTE"_latte_home. A brief technical
|
||||
description is given with the "fix latte"_fix_latte.html command.
|
||||
|
||||
:link(latte_home,https://github.com/lanl/LATTE)
|
||||
@ -729,6 +729,7 @@ make lib-latte args="-b" # download and build in lib/latte/LATTE-
|
||||
make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte
|
||||
make lib-latte args="-b -m gfortran" # download and build in lib/latte and
|
||||
# copy Makefile.lammps.gfortran to Makefile.lammps
|
||||
:pre
|
||||
|
||||
Note that 3 symbolic (soft) links, "includelink" and "liblink" and
|
||||
"filelink", are created in lib/latte to point into the LATTE home dir.
|
||||
|
||||
@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
|
||||
[Currently Available USER-INTEL Styles:]
|
||||
|
||||
Angle Styles: charmm, harmonic :ulb,l
|
||||
Bond Styles: fene, harmonic :l
|
||||
Bond Styles: fene, fourier, harmonic :l
|
||||
Dihedral Styles: charmm, harmonic, opls :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
|
||||
Improper Styles: cvff, harmonic :l
|
||||
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
|
||||
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
|
||||
sw, tersoff :l
|
||||
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long,
|
||||
rebo, sw, tersoff :l
|
||||
K-Space Styles: pppm, pppm/disp :l
|
||||
:ule
|
||||
|
||||
@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
|
||||
:c,image(JPG/user_intel.png)
|
||||
|
||||
Results are speedups obtained on Intel Xeon E5-2697v4 processors
|
||||
(code-named Broadwell) and Intel Xeon Phi 7250 processors
|
||||
(code-named Knights Landing) with "June 2017" LAMMPS built with
|
||||
Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
|
||||
per physical core. See {src/USER-INTEL/TEST/README} for the raw
|
||||
simulation rates and instructions to reproduce.
|
||||
(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
|
||||
Knights Landing), and Intel Xeon Gold 6148 processors (code-named
|
||||
Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
|
||||
2017 update 2. Results are with 1 MPI task per physical core. See
|
||||
{src/USER-INTEL/TEST/README} for the raw simulation rates and
|
||||
instructions to reproduce.
|
||||
|
||||
:line
|
||||
|
||||
@ -82,6 +83,11 @@ this order :l
|
||||
The {newton} setting applies to all atoms, not just atoms shared
|
||||
between MPI tasks :l
|
||||
Vectorization can change the order for adding pairwise forces :l
|
||||
When using the -DLMP_USE_MKL_RNG define (all included intel optimized
|
||||
makefiles do) at build time, the random number generator for
|
||||
dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
|
||||
Twister generator included in the Intel MKL library (that should be
|
||||
more robust than the default Marsaglia random number generator) :l
|
||||
:ule
|
||||
|
||||
The precision mode (described below) used with the USER-INTEL
|
||||
@ -108,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
|
||||
For some of the simple 2-body potentials without long-range
|
||||
electrostatics, performance and scalability can be better with
|
||||
the "newton off" setting added to the input script :l
|
||||
For simulations on higher node counts, add "processors * * * grid
|
||||
For simulations on higher node counts, add "processors * * * grid
|
||||
numa" to the beginning of the input script for better scalability :l
|
||||
If using {kspace_style pppm} in the input script, add
|
||||
"kspace_modify diff ad" for better performance :l
|
||||
@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs:
|
||||
Runs should be performed using MCDRAM. :ulb,l
|
||||
:ule
|
||||
|
||||
For simulations using {kspace_style pppm} on Intel CPUs
|
||||
supporting AVX-512:
|
||||
For simulations using {kspace_style pppm} on Intel CPUs supporting
|
||||
AVX-512:
|
||||
|
||||
Add "kspace_modify diff ad" to the input script :ulb,l
|
||||
The command-line option should be changed to
|
||||
@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building
|
||||
without offload support will produce a smaller binary.
|
||||
|
||||
The general requirements for Makefiles with the USER-INTEL package
|
||||
are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
|
||||
using Intel compilers, "-restrict" is required and "-qopenmp" is
|
||||
highly recommended for CCFLAGS and LINKFLAGS. LIB should include
|
||||
"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
|
||||
is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
|
||||
Other recommended CCFLAG options for best performance are
|
||||
"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
|
||||
-no-prec-div".
|
||||
are as follows. When using Intel compilers, "-restrict" is required
|
||||
and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS.
|
||||
CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
|
||||
are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
|
||||
(unless Intel Math Kernel Library (MKL) is not available in the build
|
||||
environment). For Intel compilers, LIB should include "-ltbbmalloc"
|
||||
or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
|
||||
to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
|
||||
required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
|
||||
recommended CCFLAG options for best performance are "-O2 -fno-alias
|
||||
-ansi-alias -qoverride-limits -fp-model fast=2 -no-prec-div".
|
||||
|
||||
NOTE: The vectorization and math capabilities can differ depending on
|
||||
the CPU. For Intel compilers, the "-x" flag specifies the type of
|
||||
|
||||
@ -16,7 +16,7 @@ atom_modify keyword values ... :pre
|
||||
one or more keyword/value pairs may be appended :ulb,l
|
||||
keyword = {id} or {map} or {first} or {sort} :l
|
||||
{id} value = {yes} or {no}
|
||||
{map} value = {array} or {hash}
|
||||
{map} value = {yes} or {array} or {hash}
|
||||
{first} value = group-ID = group whose atoms will appear first in internal atom lists
|
||||
{sort} values = Nfreq binsize
|
||||
Nfreq = sort atoms spatially every this many time steps
|
||||
@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l
|
||||
|
||||
[Examples:]
|
||||
|
||||
atom_modify map hash
|
||||
atom_modify map array sort 10000 2.0
|
||||
atom_modify map yes
|
||||
atom_modify map hash sort 10000 2.0
|
||||
atom_modify first colloid :pre
|
||||
|
||||
[Description:]
|
||||
@ -62,29 +62,33 @@ switch. This is described in "Section 2.2"_Section_start.html#start_2
|
||||
of the manual. If atom IDs are not used, they must be specified as 0
|
||||
for all atoms, e.g. in a data or restart file.
|
||||
|
||||
The {map} keyword determines how atom ID lookup is done for molecular
|
||||
atom styles. Lookups are performed by bond (angle, etc) routines in
|
||||
LAMMPS to find the local atom index associated with a global atom ID.
|
||||
The {map} keyword determines how atoms with specific IDs are found
|
||||
when required. Examples are the bond (angle, etc) methods which
|
||||
need to find the local index of an atom with a specific global ID
|
||||
which is a bond (angle, etc) partner. LAMMPS performs this operation
|
||||
efficiently by creating a "map", which is either an {array} or {hash}
|
||||
table, as described below.
|
||||
|
||||
When the {array} value is used, each processor stores a lookup table
|
||||
of length N, where N is the largest atom ID in the system. This is a
|
||||
When the {map} keyword is not specified in your input script, LAMMPS
|
||||
only creates a map for "atom_styles"_atom_style.html for molecular
|
||||
systems which have permanent bonds (angles, etc). No map is created
|
||||
for atomic systems, since it is normally not needed. However some
|
||||
LAMMPS commands require a map, even for atomic systems, and will
|
||||
generate an error if one does not exist. The {map} keyword thus
|
||||
allows you to force the creation of a map. The {yes} value will
|
||||
create either an {array} or {hash} style map, as explained in the next
|
||||
paragraph. The {array} and {hash} values create an array-style or
|
||||
hash-style map respectively.
|
||||
|
||||
For an {array}-style map, each processor stores a lookup table of
|
||||
length N, where N is the largest atom ID in the system. This is a
|
||||
fast, simple method for many simulations, but requires too much memory
|
||||
for large simulations. The {hash} value uses a hash table to perform
|
||||
the lookups. This can be slightly slower than the {array} method, but
|
||||
its memory cost is proportional to the number of atoms owned by a
|
||||
processor, i.e. N/P when N is the total number of atoms in the system
|
||||
and P is the number of processors.
|
||||
|
||||
When this setting is not specified in your input script, LAMMPS
|
||||
creates a map, if one is needed, as an array or hash. See the
|
||||
discussion of default values below for how LAMMPS chooses which kind
|
||||
of map to build. Note that atomic systems do not normally need to
|
||||
create a map. However, even in this case some LAMMPS commands will
|
||||
create a map to find atoms (and then destroy it), or require a
|
||||
permanent map. An example of the former is the "velocity loop
|
||||
all"_velocity.html command, which uses a map when looping over all
|
||||
atoms and insuring the same velocity values are assigned to an atom
|
||||
ID, no matter which processor owns it.
|
||||
for large simulations. For a {hash}-style map, a hash table is
|
||||
created on each processor, which finds an atom ID in constant time
|
||||
(independent of the global number of atom IDs). It can be slightly
|
||||
slower than the {array} map, but its memory cost is proportional to
|
||||
the number of atoms owned by a processor, i.e. N/P when N is the total
|
||||
number of atoms in the system and P is the number of processors.
|
||||
|
||||
The {first} keyword allows a "group"_group.html to be specified whose
|
||||
atoms will be maintained as the first atoms in each processor's list
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
:line
|
||||
|
||||
dihedral_style fourier command :h3
|
||||
dihedral_style fourier/intel command :h3
|
||||
dihedral_style fourier/omp command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
||||
@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre
|
||||
dump-ID = ID of dump to modify :ulb,l
|
||||
one or more keyword/value pairs may be appended :l
|
||||
these keywords apply to various dump styles :l
|
||||
keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
|
||||
{append} arg = {yes} or {no} or {at} N
|
||||
keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
|
||||
{append} arg = {yes} or {no}
|
||||
{at} arg = N
|
||||
N = index of frame written upon first dump
|
||||
only available after "append yes"
|
||||
{buffer} arg = {yes} or {no}
|
||||
{element} args = E1 E2 ... EN, where N = # of atom types
|
||||
E1,...,EN = element name, e.g. C or Fe or Ga
|
||||
|
||||
@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule
|
||||
|
||||
dump 1 all netcdf 100 traj.nc type x y z vx vy vz
|
||||
dump_modify 1 append yes at -1 thermo yes
|
||||
dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
|
||||
dump 1 all netcdf/mpiio 1000 traj.nc id type x y z
|
||||
dump 1 all netcdf 1000 traj.*.nc id type x y z :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
@ -73,4 +74,3 @@ section for more info.
|
||||
[Related commands:]
|
||||
|
||||
"dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
|
||||
|
||||
|
||||
@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond
|
||||
integrals are parameterized using a Slater-Koster tight-binding
|
||||
approach. This procedure, which usually is referred to as the DFTB
|
||||
method has been described in detail by ("Elstner"_#Elstner) and
|
||||
("Finnis"_#Finnis) and coworkers.
|
||||
("Finnis"_#Finnis2) and coworkers.
|
||||
|
||||
The work of the LATTE developers follows that of Elstner closely with
|
||||
respect to the physical model. However, the development of LATTE is
|
||||
@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
|
||||
M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
|
||||
7260 (1998).
|
||||
|
||||
:link(Finnis)
|
||||
:link(Finnis2)
|
||||
[(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van
|
||||
Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998).
|
||||
|
||||
@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
|
||||
[(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys.,
|
||||
141, 164123, (2014).
|
||||
|
||||
:link(Niklasson2014)
|
||||
:link(Niklasson2017)
|
||||
[(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).
|
||||
|
||||
:link(Niklasson2012)
|
||||
[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
|
||||
:link(Cawkwell2012)
|
||||
[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
|
||||
(17), 174308 (2012).
|
||||
|
||||
:link(Negre2016)
|
||||
|
||||
@ -93,7 +93,7 @@ intermediate replica with the previous and the next image:
|
||||
|
||||
Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
|
||||
|
||||
Note that in this case the specified {Kspring) is in force/distance
|
||||
Note that in this case the specified {Kspring} is in force/distance
|
||||
units.
|
||||
|
||||
With a value of {ideal}, the spring force is computed as suggested in
|
||||
@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and
|
||||
RDideal is the ideal RD for which all the images are equally spaced.
|
||||
I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
|
||||
I is the replica number). The meanDist is the average distance
|
||||
between replicas. Note that in this case the specified {Kspring) is
|
||||
between replicas. Note that in this case the specified {Kspring} is
|
||||
in force units.
|
||||
|
||||
Note that the {ideal} form of nudging can often be more effective at
|
||||
|
||||
@ -393,32 +393,36 @@ thermostatting and barostatting.
|
||||
:line
|
||||
|
||||
These fixes compute a temperature and pressure each timestep. To do
|
||||
this, the fix creates its own computes of style "temp" and "pressure",
|
||||
as if one of these two sets of commands had been issued:
|
||||
this, the thermostat and barostat fixes create their own computes of
|
||||
style "temp" and "pressure", as if one of these sets of commands had
|
||||
been issued:
|
||||
|
||||
For fix nvt:
|
||||
compute fix-ID_temp group-ID temp
|
||||
compute fix-ID_press group-ID pressure fix-ID_temp :pre
|
||||
|
||||
For fix npt and fix nph:
|
||||
compute fix-ID_temp all temp
|
||||
compute fix-ID_press all pressure fix-ID_temp :pre
|
||||
|
||||
See the "compute temp"_compute_temp.html and "compute
|
||||
pressure"_compute_pressure.html commands for details. Note that the
|
||||
IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID
|
||||
+ underscore + "press". For fix nvt, the group for the new computes
|
||||
is the same as the fix group. For fix nph and fix npt, the group for
|
||||
the new computes is "all" since pressure is computed for the entire
|
||||
system.
|
||||
For fix nvt, the group for the new temperature compute is the same as
|
||||
the fix group. For fix npt and fix nph, the group for both the new
|
||||
temperature and pressure compute is "all" since pressure is computed
|
||||
for the entire system. In the case of fix nph, the temperature
|
||||
compute is not used for thermostatting, but just for a kinetic-energy
|
||||
contribution to the pressure. See the "compute
|
||||
temp"_compute_temp.html and "compute pressure"_compute_pressure.html
|
||||
commands for details. Note that the IDs of the new computes are the
|
||||
fix-ID + underscore + "temp" or fix-ID + underscore + "press".
|
||||
|
||||
Note that these are NOT the computes used by thermodynamic output (see
|
||||
the "thermo_style"_thermo_style.html command) with ID = {thermo_temp}
|
||||
and {thermo_press}. This means you can change the attributes of this
|
||||
and {thermo_press}. This means you can change the attributes of these
|
||||
fix's temperature or pressure via the
|
||||
"compute_modify"_compute_modify.html command or print this temperature
|
||||
or pressure during thermodynamic output via the "thermo_style
|
||||
custom"_thermo_style.html command using the appropriate compute-ID.
|
||||
It also means that changing attributes of {thermo_temp} or
|
||||
{thermo_press} will have no effect on this fix.
|
||||
"compute_modify"_compute_modify.html command. Or you can print this
|
||||
temperature or pressure during thermodynamic output via the
|
||||
"thermo_style custom"_thermo_style.html command using the appropriate
|
||||
compute-ID. It also means that changing attributes of {thermo_temp}
|
||||
or {thermo_press} will have no effect on this fix.
|
||||
|
||||
Like other fixes that perform thermostatting, fix nvt and fix npt can
|
||||
be used with "compute commands"_compute.html that calculate a
|
||||
|
||||
@ -59,6 +59,7 @@ Fixes :h1
|
||||
fix_langevin
|
||||
fix_langevin_drude
|
||||
fix_langevin_eff
|
||||
fix_latte
|
||||
fix_lb_fluid
|
||||
fix_lb_momentum
|
||||
fix_lb_pc
|
||||
|
||||
@ -188,6 +188,7 @@ fix_ipi.html
|
||||
fix_langevin.html
|
||||
fix_langevin_drude.html
|
||||
fix_langevin_eff.html
|
||||
fix_latte.html
|
||||
fix_lb_fluid.html
|
||||
fix_lb_momentum.html
|
||||
fix_lb_pc.html
|
||||
|
||||
@ -62,7 +62,7 @@ args = arguments specific to the style :l
|
||||
{no_affinity} values = none
|
||||
{kokkos} args = keyword value ...
|
||||
zero or more keyword/value pairs may be appended
|
||||
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward}
|
||||
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
|
||||
{neigh} value = {full} or {half}
|
||||
full = full neighbor list
|
||||
half = half neighbor list built in thread-safe manner
|
||||
@ -75,9 +75,10 @@ args = arguments specific to the style :l
|
||||
{binsize} value = size
|
||||
size = bin size for neighbor list construction (distance units)
|
||||
{comm} value = {no} or {host} or {device}
|
||||
use value for both comm/exchange and comm/forward
|
||||
use value for comm/exchange and comm/forward and comm/reverse
|
||||
{comm/exchange} value = {no} or {host} or {device}
|
||||
{comm/forward} value = {no} or {host} or {device}
|
||||
{comm/reverse} value = {no} or {host} or {device}
|
||||
no = perform communication pack/unpack in non-KOKKOS mode
|
||||
host = perform pack/unpack on host (e.g. with OpenMP threading)
|
||||
device = perform pack/unpack on device (e.g. on GPU)
|
||||
@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at
|
||||
performing pairwise interactions, then this rule of thumb may give too
|
||||
large a binsize.
|
||||
|
||||
The {comm} and {comm/exchange} and {comm/forward} keywords determine
|
||||
The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
|
||||
whether the host or device performs the packing and unpacking of data
|
||||
when communicating per-atom data between processors. "Exchange"
|
||||
communication happens only on timesteps that neighbor lists are
|
||||
rebuilt. The data is only for atoms that migrate to new processors.
|
||||
"Forward" communication happens every timestep. The data is for atom
|
||||
"Forward" communication happens every timestep. "Reverse" communication
|
||||
happens every timestep if the {newton} option is on. The data is for atom
|
||||
coordinates and any other atom properties that need to be updated for
|
||||
ghost atoms owned by each processor.
|
||||
|
||||
The {comm} keyword is simply a short-cut to set the same value
|
||||
for both the {comm/exchange} and {comm/forward} keywords.
|
||||
for all of the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
|
||||
|
||||
The value options for these keywords are {no} or {host} or {device}.
|
||||
A value of {no} means to use the standard non-KOKKOS method of
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
pair_style dpd command :h3
|
||||
pair_style dpd/gpu command :h3
|
||||
pair_style dpd/intel command :h3
|
||||
pair_style dpd/omp command :h3
|
||||
pair_style dpd/tstat command :h3
|
||||
pair_style dpd/tstat/gpu command :h3
|
||||
|
||||
@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix.
|
||||
|
||||
Style {eam/fs} computes pairwise interactions for metals and metal
|
||||
alloys using a generalized form of EAM potentials due to Finnis and
|
||||
Sinclair "(Finnis)"_#Finnis. The total energy Ei of an atom I is
|
||||
Sinclair "(Finnis)"_#Finnis1. The total energy Ei of an atom I is
|
||||
given by
|
||||
|
||||
:c,image(Eqs/pair_eam_fs.jpg)
|
||||
@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004).
|
||||
[(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
|
||||
Daw, Baskes, Phys Rev B, 29, 6443 (1984).
|
||||
|
||||
:link(Finnis)
|
||||
:link(Finnis1)
|
||||
[(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
|
||||
|
||||
:link(Stukowski)
|
||||
|
||||
@ -1,5 +1,24 @@
|
||||
# Change Log
|
||||
|
||||
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
|
||||
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
|
||||
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
|
||||
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
|
||||
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
|
||||
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
|
||||
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
|
||||
- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
|
||||
- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
|
||||
|
||||
## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
|
||||
|
||||
|
||||
@ -443,7 +443,7 @@ endif
|
||||
# memkind support: add the memkind include and library directories, link
# against memkind and numa, and record KOKKOS_HAVE_HBWSPACE in the generated
# KokkosCore_config.tmp.
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
  # -lmemkind is listed once, together with -lnuma.
  KOKKOS_LIBS += -lmemkind -lnuma
  tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif
|
||||
|
||||
@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P8.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
|
||||
else
|
||||
# Assume that this is really a GNU compiler on P8.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P9.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
|
||||
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
|
||||
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
|
||||
else
|
||||
# Assume that this is really a GNU compiler on P9.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
|
||||
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(KOKKOS_ENABLE_ROCM)
|
||||
|
||||
template<>
|
||||
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
|
||||
private:
|
||||
int p_;
|
||||
const int state_idx_;
|
||||
uint64_t* state_;
|
||||
const int stride_;
|
||||
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::ROCm device_type;
|
||||
typedef Random_XorShift1024_Pool<device_type> pool_type;
|
||||
|
||||
enum {MAX_URAND = 0xffffffffU};
|
||||
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
|
||||
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
|
||||
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
|
||||
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
uint64_t state_0 = state_[ p_ * stride_ ];
|
||||
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
|
||||
tmp = tmp>>16;
|
||||
return static_cast<uint32_t>(tmp&MAX_URAND);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64() {
|
||||
uint64_t state_0 = state_[ p_ * stride_ ];
|
||||
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& range) {
|
||||
const uint32_t max_val = (MAX_URAND/range)*range;
|
||||
uint32_t tmp = urand();
|
||||
while(tmp>=max_val)
|
||||
urand();
|
||||
return tmp%range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& start, const uint32_t& end ) {
|
||||
return urand(end-start)+start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& range) {
|
||||
const uint64_t max_val = (MAX_URAND64/range)*range;
|
||||
uint64_t tmp = urand64();
|
||||
while(tmp>=max_val)
|
||||
urand64();
|
||||
return tmp%range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
|
||||
return urand64(end-start)+start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand() {
|
||||
return static_cast<int>(urand()/2);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& range) {
|
||||
const int max_val = (MAX_RAND/range)*range;
|
||||
int tmp = rand();
|
||||
while(tmp>=max_val)
|
||||
rand();
|
||||
return tmp%range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& start, const int& end ) {
|
||||
return rand(end-start)+start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64() {
|
||||
return static_cast<int64_t>(urand64()/2);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& range) {
|
||||
const int64_t max_val = (MAX_RAND64/range)*range;
|
||||
int64_t tmp = rand64();
|
||||
while(tmp>=max_val)
|
||||
rand64();
|
||||
return tmp%range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& start, const int64_t& end ) {
|
||||
return rand64(end-start)+start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand() {
|
||||
return 1.0f * urand64()/MAX_URAND64;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range) {
|
||||
return range * urand64()/MAX_URAND64;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end ) {
|
||||
return frand(end-start)+start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand() {
|
||||
return 1.0 * urand64()/MAX_URAND64;
|
||||
}
|
||||
|
||||
// Double-precision draw scaled by range: range times a urand64() value,
// divided by MAX_URAND64.
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
  const double scaled = range * urand64();
  return scaled/MAX_URAND64;
}
|
||||
|
||||
// Double-precision draw shifted by start: the width end-start is handed to
// the single-argument drand overload and start is added to its result.
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end ) {
  // Must forward to drand, not frand: frand takes and returns float, which
  // would narrow the width and yield only a single-precision draw.
  return drand(end-start)+start;
}
|
||||
|
||||
//Marsaglia polar method for drawing a standard normal distributed random number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while(S>=1.0) {
|
||||
U = 2.0*drand() - 1.0;
|
||||
const double V = 2.0*drand() - 1.0;
|
||||
S = U*U+V*V;
|
||||
}
|
||||
return U*std::sqrt(-2.0*log(S)/S);
|
||||
}
|
||||
|
||||
// Normal draw with the given mean and standard deviation: a normal() draw is
// multiplied by std_dev (default 1.0) and added to mean.
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0) {
  const double z = normal();
  return mean + z*std_dev;
}
|
||||
};
|
||||
|
||||
template<>
|
||||
inline
|
||||
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed,4*32768);
|
||||
}
|
||||
|
||||
template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
|
||||
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
|
||||
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
|
||||
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
|
||||
i+=blockDim_x*blockDim_y*blockDim_z;
|
||||
if(i>=num_states_) {i = i_offset;}
|
||||
}
|
||||
|
||||
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
|
||||
#else
|
||||
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
state_(state.state_idx_) = state.state_;
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline
|
||||
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed,4*32768);
|
||||
}
|
||||
|
||||
template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
|
||||
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
|
||||
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
|
||||
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
|
||||
i+=blockDim_x*blockDim_y*blockDim_z;
|
||||
if(i>=num_states_) {i = i_offset;}
|
||||
}
|
||||
|
||||
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
|
||||
#else
|
||||
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
for(int i=0; i<16; i++)
|
||||
state_(state.state_idx_,i) = state.state_[i];
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
TEST_TARGETS += test-cuda
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
|
||||
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosAlgorithms_UnitTest_ROCm
|
||||
TEST_TARGETS += test-rocm
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosAlgorithms_UnitTest_Threads
|
||||
@ -51,6 +57,9 @@ endif
|
||||
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
|
||||
|
||||
KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
|
||||
|
||||
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
|
||||
|
||||
@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
|
||||
test-cuda: KokkosAlgorithms_UnitTest_Cuda
|
||||
./KokkosAlgorithms_UnitTest_Cuda
|
||||
|
||||
test-rocm: KokkosAlgorithms_UnitTest_ROCm
|
||||
./KokkosAlgorithms_UnitTest_ROCm
|
||||
|
||||
test-threads: KokkosAlgorithms_UnitTest_Threads
|
||||
./KokkosAlgorithms_UnitTest_Threads
|
||||
|
||||
|
||||
112
lib/kokkos/algorithms/unit_tests/TestROCm.cpp
Normal file
112
lib/kokkos/algorithms/unit_tests/TestROCm.cpp
Normal file
@ -0,0 +1,112 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_ROCM
|
||||
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <TestRandom.hpp>
|
||||
#include <TestSort.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class rocm : public ::testing::Test {
|
||||
protected:
|
||||
static void SetUpTestCase()
|
||||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
Kokkos::HostSpace::execution_space::initialize();
|
||||
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
|
||||
}
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
Kokkos::Experimental::ROCm::finalize();
|
||||
Kokkos::HostSpace::execution_space::finalize();
|
||||
}
|
||||
};
|
||||
|
||||
void rocm_test_random_xorshift64( int num_draws )
|
||||
{
|
||||
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
|
||||
}
|
||||
|
||||
void rocm_test_random_xorshift1024( int num_draws )
|
||||
{
|
||||
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
|
||||
}
|
||||
|
||||
|
||||
#define ROCM_RANDOM_XORSHIFT64( num_draws ) \
|
||||
TEST_F( rocm, Random_XorShift64 ) { \
|
||||
rocm_test_random_xorshift64(num_draws); \
|
||||
}
|
||||
|
||||
#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \
|
||||
TEST_F( rocm, Random_XorShift1024 ) { \
|
||||
rocm_test_random_xorshift1024(num_draws); \
|
||||
}
|
||||
|
||||
#define ROCM_SORT_UNSIGNED( size ) \
|
||||
TEST_F( rocm, SortUnsigned ) { \
|
||||
Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \
|
||||
}
|
||||
|
||||
ROCM_RANDOM_XORSHIFT64( 132141141 )
|
||||
ROCM_RANDOM_XORSHIFT1024( 52428813 )
|
||||
ROCM_SORT_UNSIGNED(171)
|
||||
|
||||
#undef ROCM_RANDOM_XORSHIFT64
|
||||
#undef ROCM_RANDOM_XORSHIFT1024
|
||||
#undef ROCM_SORT_UNSIGNED
|
||||
}
|
||||
#else
|
||||
// Empty function compiled only when KOKKOS_ENABLE_ROCM is not defined;
// presumably present so this translation unit still defines a symbol.
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #ifdef KOKKOS_ENABLE_ROCM */
|
||||
|
||||
@ -27,7 +27,7 @@ fi
|
||||
HPCBIND_HWLOC_PARENT_CPUSET=""
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
|
||||
MY_PID="$BASHPID"
|
||||
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
|
||||
HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
|
||||
fi
|
||||
|
||||
################################################################################
|
||||
@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
|
||||
################################################################################
|
||||
HPCBIND_QUEUE_NAME=""
|
||||
declare -i HPCBIND_QUEUE_INDEX=0
|
||||
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
|
||||
declare -i HPCBIND_QUEUE_MAPPING=0
|
||||
|
||||
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
|
||||
HPCBIND_QUEUE_GPU_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="sbatch"
|
||||
if [[ ! -z "${PMI_RANK}" ]]; then
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="mpich"
|
||||
HPCBIND_QUEUE_INDEX=${PMI_RANK}
|
||||
elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="openmpi"
|
||||
HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
|
||||
elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="mvapich2"
|
||||
HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
|
||||
elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="slurm"
|
||||
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
|
||||
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
|
||||
HPCBIND_QUEUE_GPU_MAPPING=1
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="bsub"
|
||||
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
|
||||
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
|
||||
HPCBIND_QUEUE_GPU_MAPPING=1
|
||||
HPCBIND_QUEUE_MAPPING=1
|
||||
HPCBIND_QUEUE_NAME="aprun"
|
||||
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
|
||||
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Show help
|
||||
################################################################################
|
||||
@ -91,13 +102,14 @@ function show_help {
|
||||
echo " --proc-bind=<LOC> Set the initial process mask for the script"
|
||||
echo " LOC can be any valid location argument for"
|
||||
echo " hwloc-calc Default: all"
|
||||
echo " --whole-system ${cmd} will ignore the its parent process binding"
|
||||
echo " --distribute=N Distribute the current cpuset into N partitions"
|
||||
echo " --distribute-partition=I"
|
||||
echo " Use the i'th partition (zero based)"
|
||||
echo " --visible-gpus=<L> Comma separated list of gpu ids"
|
||||
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
|
||||
echo " sequential order"
|
||||
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
|
||||
echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition"
|
||||
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
|
||||
echo " --openmp=M.m Set env variables for the given OpenMP version"
|
||||
echo " Default: 4.0"
|
||||
@ -110,22 +122,30 @@ function show_help {
|
||||
echo " --force-openmp-proc-bind=<OP>"
|
||||
echo " Override logic for selecting OMP_PROC_BIND"
|
||||
echo " --no-openmp-nested Set OMP_NESTED to false"
|
||||
echo " --show-bindings Show the bindings"
|
||||
echo " --lstopo Show bindings in lstopo without executing a command"
|
||||
echo " -v|--verbose Show options and relevant environment variables"
|
||||
echo " --output-prefix=<P> Save the output to files of the form"
|
||||
echo " P-N.log, P-N.out and P-N.err where P is the prefix"
|
||||
echo " and N is the queue index or mpi rank (no spaces)"
|
||||
echo " --output-mode=<Op> How console output should be handled."
|
||||
echo " Options are all, rank0, and none. Default: rank0"
|
||||
echo " --lstopo Show bindings in lstopo"
|
||||
echo " -v|--verbose Print bindings and relevant environment variables"
|
||||
echo " -h|--help Show this message"
|
||||
echo ""
|
||||
echo "Sample Usage:"
|
||||
echo " Split the current process cpuset into 4 and use the 3rd partition"
|
||||
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
|
||||
echo " Bing the process to all even cores"
|
||||
echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
|
||||
echo " and save the output to rank specific files"
|
||||
echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
|
||||
echo " --distribute=4 -v --output-prefix=output -- command ..."
|
||||
echo " Bind the process to all even cores"
|
||||
echo " ${cmd} --proc-bind=core:even -v -- command ..."
|
||||
echo " Bind to the first 64 cores and split the current process cpuset into 4"
|
||||
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
|
||||
echo " skip GPU 0 when mapping visible devices"
|
||||
echo " Bind the the even cores of socket 0 and the odd cores of socket 1"
|
||||
echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
|
||||
echo " Skip GPU 0 when mapping visible devices"
|
||||
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
|
||||
echo " Display the current bindings"
|
||||
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
|
||||
echo " ${cmd} --proc-bind=numa:0 -- command"
|
||||
echo " Display the current bindings using lstopo"
|
||||
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
|
||||
echo ""
|
||||
@ -144,7 +164,7 @@ fi
|
||||
declare -a UNKNOWN_ARGS=()
|
||||
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
|
||||
declare -i HPCBIND_DISTRIBUTE=1
|
||||
declare -i HPCBIND_PARTITION=0
|
||||
declare -i HPCBIND_PARTITION=-1
|
||||
HPCBIND_PROC_BIND="all"
|
||||
HPCBIND_OPENMP_VERSION=4.0
|
||||
declare -i HPCBIND_OPENMP_PERCENT=100
|
||||
@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
|
||||
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
|
||||
declare -i HPCBIND_VERBOSE=0
|
||||
|
||||
declare -i HPCBIND_SHOW_BINDINGS=0
|
||||
declare -i HPCBIND_LSTOPO=0
|
||||
|
||||
for i in $@; do
|
||||
case $i in
|
||||
HPCBIND_OUTPUT_PREFIX=""
|
||||
HPCBIND_OUTPUT_MODE="rank0"
|
||||
|
||||
declare -i HPCBIND_HAS_COMMAND=0
|
||||
|
||||
for i in "$@"; do
|
||||
case "$i" in
|
||||
# number of partitions to create
|
||||
--no-hwloc-bind)
|
||||
HPCBIND_ENABLE_HWLOC_BIND=0
|
||||
@ -169,6 +193,10 @@ for i in $@; do
|
||||
HPCBIND_PROC_BIND="${i#*=}"
|
||||
shift
|
||||
;;
|
||||
--whole-system)
|
||||
HPCBIND_HWLOC_PARENT_CPUSET=""
|
||||
shift
|
||||
;;
|
||||
--distribute=*)
|
||||
HPCBIND_DISTRIBUTE="${i#*=}"
|
||||
shift
|
||||
@ -182,8 +210,8 @@ for i in $@; do
|
||||
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
|
||||
shift
|
||||
;;
|
||||
--gpu-ignore-queue)
|
||||
HPCBIND_QUEUE_GPU_MAPPING=0
|
||||
--ignore-queue)
|
||||
HPCBIND_QUEUE_MAPPING=0
|
||||
shift
|
||||
;;
|
||||
--no-gpu-mapping)
|
||||
@ -218,14 +246,18 @@ for i in $@; do
|
||||
HPCBIND_OPENMP_NESTED="false"
|
||||
shift
|
||||
;;
|
||||
--show-bindings)
|
||||
HPCBIND_VERBOSE=1
|
||||
HPCBIND_SHOW_BINDINGS=1
|
||||
--output-prefix=*)
|
||||
HPCBIND_OUTPUT_PREFIX="${i#*=}"
|
||||
shift
|
||||
;;
|
||||
--output-mode=*)
|
||||
HPCBIND_OUTPUT_MODE="${i#*=}"
|
||||
#convert to lower case
|
||||
HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
|
||||
shift
|
||||
;;
|
||||
--lstopo)
|
||||
HPCBIND_VERBOSE=1
|
||||
HPCBIND_SHOW_BINDINGS=0
|
||||
HPCBIND_LSTOPO=1
|
||||
shift
|
||||
;;
|
||||
@ -239,6 +271,7 @@ for i in $@; do
|
||||
;;
|
||||
# ignore remaining arguments
|
||||
--)
|
||||
HPCBIND_HAS_COMMAND=1
|
||||
shift
|
||||
break
|
||||
;;
|
||||
@ -250,16 +283,41 @@ for i in $@; do
|
||||
esac
|
||||
done
|
||||
|
||||
################################################################################
|
||||
# Check output mode
|
||||
################################################################################
|
||||
declare -i HPCBIND_TEE=0
|
||||
|
||||
if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
|
||||
HPCBIND_TEE=0
|
||||
elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
|
||||
HPCBIND_TEE=1
|
||||
elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
|
||||
#default to rank0 printing to screen
|
||||
HPCBIND_TEE=1
|
||||
fi
|
||||
|
||||
|
||||
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
|
||||
HPCBIND_LOG=/dev/null
|
||||
HPCBIND_ERR=/dev/null
|
||||
HPCBIND_OUT=/dev/null
|
||||
else
|
||||
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
|
||||
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
|
||||
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
|
||||
> ${HPCBIND_LOG}
|
||||
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check unknown arguments
|
||||
################################################################################
|
||||
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
|
||||
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
|
||||
echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check that visible gpus are valid
|
||||
################################################################################
|
||||
@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
|
||||
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
|
||||
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
|
||||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
|
||||
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
|
||||
echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
|
||||
HPCBIND_VISIBLE_GPUS[$i]=0;
|
||||
fi
|
||||
done
|
||||
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
|
||||
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check OpenMP percent
|
||||
################################################################################
|
||||
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
|
||||
echo "OpenMP percent < 1, setting to 1"
|
||||
HPCBIND_OPENMP_PERCENT=1
|
||||
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
|
||||
echo "OpenMP percent > 100, setting to 100"
|
||||
HPCBIND_OPENMP_PERCENT=100
|
||||
fi
|
||||
|
||||
@ -291,15 +346,21 @@ fi
|
||||
# Check distribute
|
||||
################################################################################
|
||||
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
|
||||
echo "Invalid input for distribute, changing distribute to 1"
|
||||
HPCBIND_DISTRIBUTE=1
|
||||
fi
|
||||
|
||||
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
|
||||
echo "Invalid input for distribute-partition, changing to 0"
|
||||
################################################################################
|
||||
#choose the correct partition
|
||||
################################################################################
|
||||
if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
|
||||
HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
|
||||
elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
|
||||
HPCBIND_PARTITION=0
|
||||
fi
|
||||
|
||||
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
|
||||
HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
|
||||
fi
|
||||
|
||||
################################################################################
|
||||
# Find cpuset and num threads
|
||||
@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0
|
||||
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
|
||||
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
|
||||
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
|
||||
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
|
||||
else
|
||||
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
|
||||
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
|
||||
fi
|
||||
|
||||
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
|
||||
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
|
||||
if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
|
||||
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
|
||||
HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
|
||||
else
|
||||
HPCBIND_HWLOC_CPUSET="${BINDING}"
|
||||
fi
|
||||
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
|
||||
else
|
||||
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
|
||||
@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
|
||||
################################################################################
|
||||
|
||||
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
|
||||
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
|
||||
if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
|
||||
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
|
||||
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
|
||||
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
|
||||
else
|
||||
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
|
||||
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
|
||||
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
|
||||
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -389,22 +454,22 @@ fi
|
||||
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
|
||||
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
|
||||
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
|
||||
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
|
||||
export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
|
||||
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
|
||||
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
|
||||
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
|
||||
export HPCBIND_HWLOC_PARENT_CPUSET="all"
|
||||
else
|
||||
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
|
||||
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
|
||||
fi
|
||||
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
|
||||
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
|
||||
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
|
||||
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
|
||||
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
|
||||
export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
|
||||
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
|
||||
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
|
||||
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
|
||||
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
|
||||
export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
|
||||
export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
|
||||
fi
|
||||
|
||||
|
||||
@ -412,43 +477,63 @@ fi
|
||||
# Print verbose
|
||||
################################################################################
|
||||
|
||||
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
|
||||
MY_ENV=$(env | sort)
|
||||
echo "[HPCBIND]"
|
||||
echo "${MY_ENV}" | grep -E "^HPCBIND_"
|
||||
echo "[CUDA]"
|
||||
echo "${MY_ENV}" | grep -E "^CUDA_"
|
||||
echo "[OPENMP]"
|
||||
echo "${MY_ENV}" | grep -E "^OMP_"
|
||||
fi
|
||||
TMP_ENV=$(env | sort)
|
||||
if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
|
||||
echo "[HOST]" >> ${HPCBIND_LOG}
|
||||
hostname -s >> ${HPCBIND_LOG}
|
||||
echo "[HPCBIND]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
|
||||
echo "[CUDA]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
|
||||
echo "[OPENMP]" >> ${HPCBIND_LOG}
|
||||
echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
|
||||
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
|
||||
echo "[BINDINGS]"
|
||||
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
|
||||
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
|
||||
echo "Unable to show bindings, hwloc not available."
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
|
||||
echo "[BINDINGS]" >> ${HPCBIND_LOG}
|
||||
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
|
||||
else
|
||||
echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
|
||||
fi
|
||||
else
|
||||
echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
|
||||
hostname -s > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
|
||||
echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
|
||||
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
|
||||
echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
|
||||
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
|
||||
else
|
||||
echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
|
||||
fi
|
||||
fi
|
||||
|
||||
################################################################################
|
||||
# Run command
|
||||
################################################################################
|
||||
|
||||
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
|
||||
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
|
||||
else
|
||||
eval $@
|
||||
fi
|
||||
else
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
|
||||
echo "[BINDINGS]"
|
||||
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
|
||||
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
|
||||
# must be the last executed command so that the return value is correct
|
||||
if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
|
||||
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
|
||||
elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
|
||||
# clear output files
|
||||
> ${HPCBIND_ERR}
|
||||
> ${HPCBIND_OUT}
|
||||
if [[ ${HPCBIND_TEE} -eq 0 ]]; then
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
|
||||
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
|
||||
else
|
||||
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
|
||||
eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
|
||||
fi
|
||||
else
|
||||
echo "Unable to show bindings, hwloc not available."
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
|
||||
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
|
||||
else
|
||||
eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -1,221 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# check if hwloc commands exist
|
||||
declare -i HAS_HWLOC=0
|
||||
type hwloc-bind >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-distrib >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-ls >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-calc >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-ps >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
|
||||
#parse args
|
||||
declare -a UNKNOWN_ARGS=()
|
||||
declare -i DISTRIBUTE=1
|
||||
declare -i INDEX=0
|
||||
PROC_BIND="all"
|
||||
CURRENT_CPUSET=""
|
||||
OPENMP_VERSION=4.0
|
||||
OPENMP_PROC_BIND=True
|
||||
OPENMP_NESTED=True
|
||||
VERBOSE=False
|
||||
|
||||
#get the current process cpuset
|
||||
if [[ ${HAS_HWLOC} -eq 0 ]]; then
|
||||
MY_PID="$BASHPID"
|
||||
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
|
||||
echo "$CURRENT_CPUSET"
|
||||
fi
|
||||
|
||||
# Print the usage text for this script to stdout.
# The script's basename is substituted into the usage and sample lines.
function show_help {
  # Declare and assign separately so `local` does not mask basename's status.
  local cmd
  cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> -- command ..."
  echo " Uses hwloc to divide the node into the given number of groups,"
  echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
  echo " selected group."
  echo ""
  echo " NOTE: This command assumes it has exclusive use of the node"
  echo ""
  echo "Options:"
  echo " --proc-bind=<LOC> Set the initial process mask for the script. "
  echo " LOC can be any valid location argument for"
  echo " hwloc-calc. Defaults to the entire machine"
  echo " --distribute=N Distribute the current proc-bind into N groups"
  echo " --index=I Use the i'th group (zero based)"
  echo " --openmp=M.m Set env variables for the given OpenMP version"
  echo " (default 4.0)"
  echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
  echo " --no-openmp-nested Set OMP_NESTED to false"
  echo " -v|--verbose"
  echo " -h|--help"
  echo ""
  echo "Sample Usage:"
  echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
  echo ""
}
|
||||
|
||||
if [[ "$#" -eq 0 ]]; then
|
||||
show_help
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
# Parse the script's options. Each recognized option is shifted off the
# positional parameters; parsing stops at "--", so whatever remains in "$@"
# afterwards is the command to run. Unrecognized arguments are collected in
# UNKNOWN_ARGS.
# "$@" and "$i" are quoted so arguments containing whitespace or glob
# characters are not split or expanded.
for i in "$@"; do
  case "$i" in
    # initial process mask passed to hwloc-calc
    --proc-bind=*)
      PROC_BIND="${i#*=}"
      shift
      ;;
    # number of partitions to create
    --distribute=*)
      DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which group to use
    --index=*)
      INDEX="${i#*=}"
      shift
      ;;
    --openmp=*)
      OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      shift
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      shift
      ;;
    -v|--verbose)
      VERBOSE=True
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
# Abort with status 1 if any unrecognized options were collected.
# -gt compares the element count numerically; ">" inside [[ ]] is a string
# comparison.
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
if [[ ${DISTRIBUTE} -le 0 ]]; then
|
||||
echo "Invalid input for distribute, changing distribute to 1"
|
||||
DISTRIBUTE=1
|
||||
fi
|
||||
|
||||
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
|
||||
echo "Invalid input for index, changing index to 0"
|
||||
INDEX=0
|
||||
fi
|
||||
|
||||
if [[ ${HAS_HWLOC} -ne 0 ]]; then
|
||||
echo "hwloc not found, no process binding will occur"
|
||||
DISTRIBUTE=1
|
||||
INDEX=0
|
||||
fi
|
||||
|
||||
if [[ ${HAS_HWLOC} -eq 0 ]]; then
|
||||
|
||||
if [[ "${CURRENT_CPUSET}" == "" ]]; then
|
||||
BINDING=$(hwloc-calc ${PROC_BIND})
|
||||
else
|
||||
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
|
||||
fi
|
||||
|
||||
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
|
||||
CPUSET=${CPUSETS[${INDEX}]}
|
||||
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
|
||||
|
||||
if [[ "${VERBOSE}" == "True" ]]; then
|
||||
echo "hwloc: true"
|
||||
echo " proc_bind: ${PROC_BIND}"
|
||||
echo " distribute: ${DISTRIBUTE}"
|
||||
echo " index: ${INDEX}"
|
||||
echo " parent_cpuset: ${CURRENT_CPUSET}"
|
||||
echo " cpuset: ${CPUSET}"
|
||||
echo "omp_num_threads: ${NUM_THREADS}"
|
||||
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
|
||||
echo "omp_nested: ${OPENMP_NESTED}"
|
||||
echo "OpenMP: ${OPENMP_VERSION}"
|
||||
fi
|
||||
|
||||
# set OMP env
|
||||
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
|
||||
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
|
||||
export OMP_PLACES="threads"
|
||||
export OMP_PROC_BIND="spread"
|
||||
else
|
||||
export OMP_PROC_BIND="true"
|
||||
unset OMP_PLACES
|
||||
fi
|
||||
else
|
||||
unset OMP_PLACES
|
||||
unset OMP_PROC_BIND
|
||||
fi
|
||||
if [[ "${OPENMP_NESTED}" == "True" ]]; then
|
||||
export OMP_NESTED="true"
|
||||
else
|
||||
export OMP_NESTED="false"
|
||||
fi
|
||||
export OMP_NUM_THREADS="${NUM_THREADS}"
|
||||
|
||||
hwloc-bind ${CPUSET} -- $@
|
||||
else
|
||||
NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
|
||||
|
||||
if [[ "${VERBOSE}" == "True" ]]; then
|
||||
echo "hwloc: false"
|
||||
echo "omp_num_threads: ${NUM_THREADS}"
|
||||
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
|
||||
echo "omp_nested: ${OPENMP_NESTED}"
|
||||
echo "OpenMP: ${OPENMP_VERSION}"
|
||||
fi
|
||||
|
||||
# set OMP env
|
||||
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
|
||||
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
|
||||
export OMP_PLACES="threads"
|
||||
export OMP_PROC_BIND="spread"
|
||||
else
|
||||
export OMP_PROC_BIND="true"
|
||||
unset OMP_PLACES
|
||||
fi
|
||||
else
|
||||
unset OMP_PLACES
|
||||
unset OMP_PROC_BIND
|
||||
fi
|
||||
if [[ "${OPENMP_NESTED}" == "True" ]]; then
|
||||
export OMP_NESTED="true"
|
||||
else
|
||||
export OMP_NESTED="false"
|
||||
fi
|
||||
export OMP_NUM_THREADS="${NUM_THREADS}"
|
||||
|
||||
eval $@
|
||||
fi
|
||||
|
||||
@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
|
||||
# Check if we have an optimization argument already
|
||||
optimization_applied=0
|
||||
|
||||
# Check if we have -std=c++X or --std=c++X already
|
||||
stdcxx_applied=0
|
||||
|
||||
#echo "Arguments: $# $@"
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
@ -130,10 +133,16 @@ do
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle c++11 setting
|
||||
--std=c++11|-std=c++11)
|
||||
shared_args="$shared_args $1"
|
||||
#Handle c++11
|
||||
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
|
||||
if [ $stdcxx_applied -eq 1 ]; then
|
||||
# A repeated -std/--std flag is not appended to shared_args; only the first one is kept.
echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
|
||||
else
|
||||
shared_args="$shared_args $1"
|
||||
stdcxx_applied=1
|
||||
fi
|
||||
;;
|
||||
|
||||
#strip off -std=c++98 due to nvcc warnings, since Tribits will place both -std=c++11 and -std=c++98
|
||||
-std=c++98|--std=c++98)
|
||||
;;
|
||||
|
||||
@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
|
||||
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
|
||||
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
|
||||
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
|
||||
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
module purge
|
||||
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
|
||||
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
|
||||
|
||||
#Run Trilinos CheckinTest
|
||||
|
||||
@ -125,6 +125,123 @@ namespace Impl {
|
||||
};
|
||||
}
|
||||
|
||||
/// \class GraphRowViewConst
|
||||
/// \brief View of a row of a sparse graph.
|
||||
/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
|
||||
///
|
||||
/// This class provides a generic view of a row of a sparse graph.
|
||||
/// We intended this class to view a row of a StaticCrsGraph, but
|
||||
/// GraphType need not necessarily be StaticCrsGraph.
|
||||
///
|
||||
/// The row view is suited for computational kernels like sparse
|
||||
/// matrix-vector multiply that only read the column indices of the
|
||||
/// sparse matrix. The view is always const as it does not allow graph modification.
|
||||
///
|
||||
/// Here is an example loop over the entries in the row:
|
||||
/// \code
|
||||
/// typedef typename GraphRowViewConst<GraphType>::ordinal_type ordinal_type;
|
||||
///
|
||||
/// GraphRowViewConst<GraphType> G_i = ...;
|
||||
/// const ordinal_type numEntries = G_i.length;
|
||||
/// for (ordinal_type k = 0; k < numEntries; ++k) {
|
||||
/// ordinal_type j = G_i.colidx (k);
|
||||
/// // ... do something with column index j ...
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// GraphType must provide the \c data_type and \c entries_type
|
||||
/// typedefs. In addition, it must make sense to use GraphRowViewConst to
|
||||
/// view a row of GraphType. In particular, column
|
||||
/// indices of a row must be accessible using the <tt>entries</tt>
|
||||
/// resp. <tt>colidx</tt> arrays given to the constructor of this
|
||||
/// class, with a constant <tt>stride</tt> between successive entries.
|
||||
/// The stride is one for the compressed sparse row storage format (as
|
||||
/// is used by CrsMatrix), but may be greater than one for other
|
||||
/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
|
||||
template<class GraphType>
|
||||
struct GraphRowViewConst {
|
||||
//! The type of the column indices in the row.
|
||||
typedef const typename GraphType::data_type ordinal_type;
|
||||
|
||||
private:
|
||||
//! Array of (local) column indices in the row.
|
||||
ordinal_type* colidx_;
|
||||
/// \brief Stride between successive entries in the row.
|
||||
///
|
||||
/// For compressed sparse row (CSR) storage, this is always one.
|
||||
/// This might be greater than one for storage formats like ELLPACK
|
||||
/// or Jagged Diagonal. Nevertheless, the stride can never be
|
||||
/// greater than the number of rows or columns in the matrix. Thus,
|
||||
/// \c ordinal_type is the correct type.
|
||||
const ordinal_type stride_;
|
||||
|
||||
public:
|
||||
/// \brief Constructor
|
||||
///
|
||||
/// \param colidx_in [in] Array of the row's column indices.
|
||||
/// \param stride [in] (Constant) stride between matrix entries in
|
||||
/// each of the above arrays.
|
||||
/// \param count [in] Number of entries in the row.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
GraphRowViewConst ( ordinal_type* const colidx_in,
|
||||
const ordinal_type& stride,
|
||||
const ordinal_type& count) :
|
||||
colidx_ (colidx_in), stride_ (stride), length (count)
|
||||
{}
|
||||
|
||||
/// \brief Constructor with offset into \c colidx array
|
||||
///
|
||||
/// \param colidx_in [in] View of the graph's column indices; the row starts at offset \c idx.
|
||||
/// \param stride [in] (Constant) stride between matrix entries in
|
||||
/// each of the above arrays.
|
||||
/// \param count [in] Number of entries in the row.
|
||||
/// \param idx [in] Start offset into \c colidx array
|
||||
///
|
||||
/// \tparam OffsetType The type of \c idx (see above). Must be a
|
||||
/// built-in integer type. This may differ from ordinal_type.
|
||||
/// For example, the matrix may have dimensions that fit in int,
|
||||
/// but a number of entries that does not fit in int.
|
||||
template<class OffsetType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
|
||||
const ordinal_type& stride,
|
||||
const ordinal_type& count,
|
||||
const OffsetType& idx,
|
||||
const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
|
||||
colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
|
||||
{}
|
||||
|
||||
/// \brief Number of entries in the row.
|
||||
///
|
||||
/// This is a public const field rather than a public const method,
|
||||
/// in order to avoid possible overhead of a method call if the
|
||||
/// compiler is unable to inline that method call.
|
||||
///
|
||||
/// We assume that rows contain no duplicate entries (i.e., entries
|
||||
/// with the same column index). Thus, a row may have up to
|
||||
/// A.numCols() entries. This means that the correct type of
|
||||
/// 'length' is ordinal_type.
|
||||
const ordinal_type length;
|
||||
|
||||
/// \brief (Const) reference to the column index of entry i in this
|
||||
/// row of the sparse matrix.
|
||||
///
|
||||
/// "Entry i" is not necessarily the entry with column index i, nor
|
||||
/// does i necessarily correspond to the (local) row index.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ordinal_type& colidx (const ordinal_type& i) const {
|
||||
return colidx_[i*stride_];
|
||||
}
|
||||
|
||||
/// \brief An alias for colidx
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ordinal_type& operator()(const ordinal_type& i) const {
|
||||
return colidx(i);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// \class StaticCrsGraph
|
||||
/// \brief Compressed row storage array.
|
||||
///
|
||||
@ -218,6 +335,38 @@ public:
|
||||
static_cast<size_type> (0);
|
||||
}
|
||||
|
||||
/// \brief Return a const view of row i of the graph.
|
||||
///
|
||||
/// If row i does not belong to the graph, return an empty view.
|
||||
///
|
||||
/// The returned object \c view implements the following interface:
|
||||
/// <ul>
|
||||
/// <li> \c view.length is the number of entries in the row </li>
|
||||
/// <li> \c view.colidx(k) returns a const reference to the
|
||||
/// column index of the k-th entry in the row </li>
|
||||
/// </ul>
|
||||
/// k is not a column index; it just counts from 0 to
|
||||
/// <tt>view.length - 1</tt>.
|
||||
///
|
||||
/// Users should not rely on the return type of this method. They
|
||||
/// should instead assign to 'auto'. That allows compile-time
|
||||
/// polymorphism for different kinds of sparse matrix formats (e.g.,
|
||||
/// ELLPACK or Jagged Diagonal) that we may wish to support in the
|
||||
/// future.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
|
||||
const size_type start = row_map(i);
|
||||
// count is guaranteed to fit in ordinal_type, as long as no row
|
||||
// has duplicate entries.
|
||||
const data_type count = static_cast<data_type> (row_map(i+1) - start);
|
||||
|
||||
if (count == 0) {
|
||||
return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
|
||||
} else {
|
||||
return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Create a row partitioning into a given number of blocks
|
||||
* balancing non-zeros + a fixed cost per row.
|
||||
*/
|
||||
|
||||
@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
|
||||
// LL
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
m_func(offset_0 , offset_1);
|
||||
}
|
||||
@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
|
||||
// LR
|
||||
else {
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
m_func(offset_0 , offset_1);
|
||||
}
|
||||
@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
// Loop over size maxnumblocks until full range covered
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
m_func(Tag(), offset_0 , offset_1);
|
||||
}
|
||||
@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
|
||||
}
|
||||
else {
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
m_func(Tag(), offset_0 , offset_1);
|
||||
}
|
||||
@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
|
||||
// LL
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2);
|
||||
}
|
||||
@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
|
||||
// LR
|
||||
else {
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2);
|
||||
}
|
||||
@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
|
||||
{
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
m_func(Tag(), offset_0 , offset_1 , offset_2);
|
||||
}
|
||||
@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
|
||||
}
|
||||
else {
|
||||
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
|
||||
m_func(Tag(), offset_0 , offset_1 , offset_2);
|
||||
}
|
||||
@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
|
||||
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
|
||||
|
||||
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3);
|
||||
}
|
||||
@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
|
||||
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3);
|
||||
}
|
||||
@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
|
||||
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
|
||||
|
||||
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
|
||||
}
|
||||
@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
|
||||
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
|
||||
// Use the loop index j (not its fixed starting value tile_id1) so each pass of the j-loop addresses a new tile.
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
|
||||
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
|
||||
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
|
||||
}
|
||||
@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
|
||||
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
|
||||
|
||||
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
|
||||
}
|
||||
@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
|
||||
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
|
||||
}
|
||||
@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
|
||||
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
|
||||
|
||||
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
|
||||
}
|
||||
@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
|
||||
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
|
||||
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
|
||||
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
|
||||
}
|
||||
@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
|
||||
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
|
||||
|
||||
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
|
||||
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
|
||||
|
||||
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
|
||||
}
|
||||
@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
|
||||
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
|
||||
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
|
||||
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
|
||||
}
|
||||
@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
|
||||
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
|
||||
|
||||
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
|
||||
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
|
||||
|
||||
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
|
||||
}
|
||||
@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
|
||||
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
|
||||
|
||||
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
|
||||
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
|
||||
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
|
||||
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
|
||||
|
||||
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
|
||||
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
|
||||
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
|
||||
|
||||
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
|
||||
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
|
||||
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
|
||||
|
||||
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
|
||||
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
|
||||
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
|
||||
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
|
||||
}
|
||||
|
||||
@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
|
||||
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
//__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_constant_memory()
|
||||
{
|
||||
const DriverType & driver =
|
||||
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
//__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
{
|
||||
driver();
|
||||
|
||||
@ -242,45 +242,89 @@ public:
|
||||
re_ = v;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator += (const complex<RealType>& src) {
|
||||
complex<RealType>&
|
||||
operator += (const complex<InputRealType>& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ += src.re_;
|
||||
im_ += src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator += (const volatile complex<RealType>& src) volatile {
|
||||
void
|
||||
operator += (const volatile complex<InputRealType>& src) volatile {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ += src.re_;
|
||||
im_ += src.im_;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator += (const RealType& src) {
|
||||
complex<RealType>&
|
||||
operator += (const std::complex<RealType>& src) {
|
||||
re_ += src.real();
|
||||
im_ += src.imag();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>&
|
||||
operator += (const InputRealType& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ += src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator += (const volatile RealType& src) volatile {
|
||||
void
|
||||
operator += (const volatile InputRealType& src) volatile {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ += src;
|
||||
}
|
||||
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator -= (const complex<RealType>& src) {
|
||||
complex<RealType>&
|
||||
operator -= (const complex<InputRealType>& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ -= src.re_;
|
||||
im_ -= src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator -= (const RealType& src) {
|
||||
complex<RealType>&
|
||||
operator -= (const std::complex<RealType>& src) {
|
||||
re_ -= src.real();
|
||||
im_ -= src.imag();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>&
|
||||
operator -= (const InputRealType& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ -= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator *= (const complex<RealType>& src) {
|
||||
complex<RealType>&
|
||||
operator *= (const complex<InputRealType>& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
const RealType realPart = re_ * src.re_ - im_ * src.im_;
|
||||
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
|
||||
re_ = realPart;
|
||||
@ -288,8 +332,12 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator *= (const volatile complex<RealType>& src) volatile {
|
||||
void
|
||||
operator *= (const volatile complex<InputRealType>& src) volatile {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
const RealType realPart = re_ * src.re_ - im_ * src.im_;
|
||||
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
|
||||
re_ = realPart;
|
||||
@ -297,20 +345,70 @@ public:
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator *= (const RealType& src) {
|
||||
complex<RealType>&
|
||||
operator *= (const std::complex<RealType>& src) {
|
||||
const RealType realPart = re_ * src.real() - im_ * src.imag();
|
||||
const RealType imagPart = re_ * src.imag() + im_ * src.real();
|
||||
re_ = realPart;
|
||||
im_ = imagPart;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>&
|
||||
operator *= (const InputRealType& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ *= src;
|
||||
im_ *= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator *= (const volatile RealType& src) volatile {
|
||||
void
|
||||
operator *= (const volatile InputRealType& src) volatile {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
re_ *= src;
|
||||
im_ *= src;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator /= (const complex<RealType>& y) {
|
||||
complex<RealType>&
|
||||
operator /= (const complex<InputRealType>& y) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
|
||||
// If the real part is +/-Inf and the imaginary part is -/+Inf,
|
||||
// this won't change the result.
|
||||
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
|
||||
|
||||
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
|
||||
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
|
||||
// because y/s is NaN.
|
||||
if (s == 0.0) {
|
||||
this->re_ /= s;
|
||||
this->im_ /= s;
|
||||
}
|
||||
else {
|
||||
const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
|
||||
const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
|
||||
const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
|
||||
y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
|
||||
*this = x_scaled * y_conj_scaled;
|
||||
*this /= y_scaled_abs;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>&
|
||||
operator /= (const std::complex<RealType>& y) {
|
||||
|
||||
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
|
||||
// If the real part is +/-Inf and the imaginary part is -/+Inf,
|
||||
// this won't change the result.
|
||||
@ -334,57 +432,95 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator /= (const RealType& src) {
|
||||
complex<RealType>&
|
||||
operator /= (const InputRealType& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
re_ /= src;
|
||||
im_ /= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const complex<RealType>& src) {
|
||||
return (re_ == src.re_) && (im_ == src.im_);
|
||||
bool
|
||||
operator == (const complex<InputRealType>& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const RealType src) {
|
||||
return (re_ == src) && (im_ == RealType(0));
|
||||
bool
|
||||
operator == (const std::complex<RealType>& src) {
|
||||
return (re_ == src.real()) && (im_ == src.imag());
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool
|
||||
operator == (const InputRealType src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool
|
||||
operator != (const complex<InputRealType>& src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const complex<RealType>& src) {
|
||||
return (re_ != src.re_) || (im_ != src.im_);
|
||||
bool
|
||||
operator != (const std::complex<RealType>& src) {
|
||||
return (re_ != src.real()) || (im_ != src.imag());
|
||||
}
|
||||
|
||||
template<typename InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const RealType src) {
|
||||
return (re_ != src) || (im_ != RealType(0));
|
||||
}
|
||||
bool
|
||||
operator != (const InputRealType src) {
|
||||
static_assert(std::is_convertible<InputRealType,RealType>::value,
|
||||
"InputRealType must be convertible to RealType");
|
||||
|
||||
return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
//! Binary + operator for complex complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for complex scalar.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () + y , x.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator + (const complex<RealType1>& x, const RealType2& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for scalar complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x + y.real (), y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator + (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
|
||||
}
|
||||
|
||||
//! Unary + operator for complex.
|
||||
@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
|
||||
}
|
||||
|
||||
//! Binary - operator for complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for complex scalar.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () - y , x.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator - (const complex<RealType1>& x, const RealType2& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for scalar complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x - y.real (), - y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator - (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
|
||||
}
|
||||
|
||||
//! Unary - operator for complex.
|
||||
@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
|
||||
}
|
||||
|
||||
//! Binary * operator for complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for std::complex and complex.
|
||||
@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
/// This function cannot be called in a CUDA device function, because
|
||||
/// std::complex's methods and nonmember functions are not marked as
|
||||
/// CUDA device functions.
|
||||
template<class RealType>
|
||||
complex<RealType>
|
||||
operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
template<class RealType1, class RealType2>
|
||||
inline
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for RealType times complex.
|
||||
///
|
||||
/// This function exists because the compiler doesn't know that
|
||||
/// RealType and complex<RealType> commute with respect to operator*.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator * (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for RealType times complex.
|
||||
///
|
||||
/// This function exists because the compiler doesn't know that
|
||||
/// RealType and complex<RealType> commute with respect to operator*.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const complex<RealType>& y, const RealType& x) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator * (const complex<RealType1>& y, const RealType2& x) {
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
//! Imaginary part of a complex number.
|
||||
@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType1>
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator / (const complex<RealType1>& x, const RealType2& y) {
|
||||
return complex<RealType1> (real (x) / y, imag (x) / y);
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
|
||||
}
|
||||
|
||||
//! Binary operator / for complex.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator / (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
|
||||
// If the real part is +/-Inf and the imaginary part is -/+Inf,
|
||||
// this won't change the result.
|
||||
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
|
||||
|
||||
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
|
||||
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
|
||||
// because y/s is NaN.
|
||||
if (s == 0.0) {
|
||||
return complex<RealType> (real (x) / s, imag (x) / s);
|
||||
return complex<common_real_type> (real (x) / s, imag (x) / s);
|
||||
}
|
||||
else {
|
||||
const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
|
||||
const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
|
||||
const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
|
||||
const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
|
||||
const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
|
||||
const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
|
||||
imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
|
||||
complex<RealType> result = x_scaled * y_conj_scaled;
|
||||
complex<common_real_type> result = x_scaled * y_conj_scaled;
|
||||
result /= y_scaled_abs;
|
||||
return result;
|
||||
}
|
||||
@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType1>
|
||||
complex<typename std::common_type<RealType1,RealType2>::type>
|
||||
operator / (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<RealType1> (x)/y;
|
||||
return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
|
||||
}
|
||||
|
||||
//! Equality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) == real (y) && imag (x) == imag (y);
|
||||
bool
|
||||
operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) &&
|
||||
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
|
||||
}
|
||||
|
||||
/// \brief Equality operator for std::complex and Kokkos::complex.
|
||||
@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
/// Otherwise, CUDA builds will give compiler warnings ("warning:
|
||||
/// calling a constexpr __host__ function("real") from a __host__
|
||||
/// __device__ function("operator==") is not allowed").
|
||||
template<class RealType>
|
||||
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) == real (y) && std::imag (x) == imag (y);
|
||||
template<class RealType1, class RealType2>
|
||||
inline
|
||||
bool
|
||||
operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) &&
|
||||
static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
|
||||
}
|
||||
|
||||
|
||||
//! Equality operator for complex and real number.
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const complex<RealType1>& x, const RealType2& y) {
|
||||
return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
|
||||
bool
|
||||
operator == (const complex<RealType1>& x, const RealType2& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) &&
|
||||
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
|
||||
}
|
||||
|
||||
//! Equality operator for real and complex number.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const RealType& x, const complex<RealType>& y) {
|
||||
bool
|
||||
operator == (const RealType1& x, const complex<RealType2>& y) {
|
||||
return y == x;
|
||||
}
|
||||
|
||||
//! Inequality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) != real (y) || imag (x) != imag (y);
|
||||
bool
|
||||
operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) ||
|
||||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
|
||||
}
|
||||
|
||||
//! Inequality operator for std::complex and Kokkos::complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) != real (y) || std::imag (x) != imag (y);
|
||||
template<class RealType1, class RealType2>
|
||||
inline
|
||||
bool
|
||||
operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) ||
|
||||
static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
|
||||
}
|
||||
|
||||
//! Inequality operator for complex and real number.
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const complex<RealType1>& x, const RealType2& y) {
|
||||
return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
|
||||
bool
|
||||
operator != (const complex<RealType1>& x, const RealType2& y) {
|
||||
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
|
||||
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) ||
|
||||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
|
||||
}
|
||||
|
||||
//! Inequality operator for real and complex number.
|
||||
template<class RealType>
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const RealType& x, const complex<RealType>& y) {
|
||||
bool
|
||||
operator != (const RealType1& x, const complex<RealType2>& y) {
|
||||
return y != x;
|
||||
}
|
||||
|
||||
|
||||
@ -353,7 +353,14 @@ struct CountAndFill {
|
||||
struct Fill {};
|
||||
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
|
||||
auto j = m_crs.row_map(i);
|
||||
data_type* fill = &(m_crs.entries(j));
|
||||
/* we don't want to access entries(entries.size()), even if its just to get its
|
||||
address and never use it.
|
||||
this can happen when row (i) is empty and all rows after it are also empty.
|
||||
we could compare to row_map(i + 1), but that is a read from global memory,
|
||||
whereas dimension_0() should be part of the View in registers (or constant memory) */
|
||||
data_type* fill =
|
||||
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
|
||||
nullptr : (&(m_crs.entries(j)));
|
||||
m_functor(i, fill);
|
||||
}
|
||||
using self_type = CountAndFill<CrsType, Functor>;
|
||||
|
||||
@ -147,12 +147,11 @@ public:
|
||||
, const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
static constexpr const char* name() { return "HBW"; }
|
||||
|
||||
private:
|
||||
|
||||
AllocationMechanism m_alloc_mech;
|
||||
static constexpr const char* m_name = "HBW";
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
|
||||
};
|
||||
|
||||
|
||||
@ -192,7 +192,7 @@ template<>
|
||||
struct reduction_identity<float> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
|
||||
};
|
||||
|
||||
@ -200,7 +200,7 @@ template<>
|
||||
struct reduction_identity<double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
|
||||
};
|
||||
|
||||
@ -208,7 +208,7 @@ template<>
|
||||
struct reduction_identity<long double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
|
||||
};
|
||||
|
||||
|
||||
@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
|
||||
#define threadIdx_x (hc_get_workitem_id(0))
|
||||
#define threadIdx_y (hc_get_workitem_id(1))
|
||||
#define threadIdx_z (hc_get_workitem_id(2))
|
||||
|
||||
#define blockIdx_x (hc_get_group_id(0))
|
||||
#define blockIdx_y (hc_get_group_id(1))
|
||||
#define blockIdx_z (hc_get_group_id(2))
|
||||
|
||||
#define blockDim_x (hc_get_group_size(0))
|
||||
#define blockDim_y (hc_get_group_size(1))
|
||||
#define blockDim_z (hc_get_group_size(2))
|
||||
|
||||
#define gridDim_x (hc_get_num_groups(0))
|
||||
#define gridDim_y (hc_get_num_groups(1))
|
||||
#define gridDim_z (hc_get_num_groups(2))
|
||||
|
||||
|
||||
#include <ROCm/Kokkos_ROCm_Parallel.hpp>
|
||||
#include <ROCm/Kokkos_ROCm_Task.hpp>
|
||||
|
||||
|
||||
@ -88,6 +88,7 @@ build-makefile-kokkos:
|
||||
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Variables used in application Makefiles" >> Makefile.kokkos
|
||||
echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
|
||||
|
||||
@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
|
||||
, thread_local_bytes
|
||||
);
|
||||
|
||||
omp_set_num_threads(partition_size);
|
||||
f( omp_get_thread_num(), omp_get_num_threads() );
|
||||
|
||||
Impl::t_openmp_instance->~Exec();
|
||||
|
||||
@ -113,7 +113,6 @@ void reduce_enqueue(
|
||||
|
||||
if (output_length < 1) return;
|
||||
|
||||
assert(output_result != nullptr);
|
||||
const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
|
||||
|
||||
// allocate host and device memory for the results from each team
|
||||
@ -176,14 +175,17 @@ void reduce_enqueue(
|
||||
}
|
||||
|
||||
});
|
||||
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
|
||||
if (output_result != nullptr)
|
||||
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
|
||||
fut.wait();
|
||||
|
||||
copy(result,result_cpu.data());
|
||||
for(std::size_t i=0;i<td.num_tiles;i++)
|
||||
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
|
||||
if (output_result != nullptr) {
|
||||
for(std::size_t i=0;i<td.num_tiles;i++)
|
||||
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
|
||||
|
||||
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
|
||||
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ void scan_enqueue(
|
||||
hc::array<value_type> result(td.num_tiles);
|
||||
hc::array<value_type> scratch(len);
|
||||
|
||||
tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
|
||||
tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
|
||||
{
|
||||
const auto local = t_idx.local[0];
|
||||
const auto global = t_idx.global[0];
|
||||
@ -135,7 +135,7 @@ void scan_enqueue(
|
||||
ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
|
||||
|
||||
copy(result_cpu.data(),result);
|
||||
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
|
||||
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]]
|
||||
{
|
||||
// const auto local = t_idx.local[0];
|
||||
const auto global = t_idx.global[0];
|
||||
|
||||
@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
|
||||
return full != i ? _bit_scan_forward( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return full != i ? __cnttz4( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_CRAYC )
|
||||
return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
|
||||
#else
|
||||
@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
|
||||
return _bit_scan_forward(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return __cnttz4(i);
|
||||
#elif defined( KOKKOS_COMPILER_CRAYC )
|
||||
return i ? _popcnt(~i & (i-1)) : -1;
|
||||
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_ffs(i) - 1;
|
||||
#else
|
||||
unsigned t = 1u;
|
||||
int r = 0;
|
||||
while ( i && ( i & t == 0 ) )
|
||||
{
|
||||
t = t << 1;
|
||||
++r;
|
||||
int offset = -1;
|
||||
if ( i ) {
|
||||
for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
|
||||
}
|
||||
return r;
|
||||
return offset;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
|
||||
return _bit_scan_reverse(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return shift - __cntlz4(i);
|
||||
#elif defined( KOKKOS_COMPILER_CRAYC )
|
||||
return i ? shift - _leadz32(i) : 0 ;
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return shift - __builtin_clz(i);
|
||||
#else
|
||||
unsigned t = 1u << shift;
|
||||
int r = 0;
|
||||
while ( i && ( i & t == 0 ) )
|
||||
{
|
||||
t = t >> 1;
|
||||
++r;
|
||||
int offset = 0;
|
||||
if ( i ) {
|
||||
for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
|
||||
}
|
||||
return r;
|
||||
return offset;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -142,6 +142,8 @@ int bit_count( unsigned i )
|
||||
return _popcnt32(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return __popcnt4(i);
|
||||
#elif defined( KOKKOS_COMPILER_CRAYC )
|
||||
return _popcnt(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_popcount(i);
|
||||
#else
|
||||
|
||||
@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
|
||||
}
|
||||
}
|
||||
|
||||
constexpr const char* HBWSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
@ -114,7 +114,7 @@ struct TestComplexBasicMath {
|
||||
typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
|
||||
|
||||
void testit () {
|
||||
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
|
||||
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
|
||||
h_results = Kokkos::create_mirror_view(d_results);
|
||||
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
|
||||
@ -125,6 +125,7 @@ struct TestComplexBasicMath {
|
||||
std::complex<double> b(3.25,5.75);
|
||||
std::complex<double> d(1.0,2.0);
|
||||
double c = 9.3;
|
||||
int e = 2;
|
||||
|
||||
std::complex<double> r;
|
||||
r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
|
||||
@ -147,6 +148,12 @@ struct TestComplexBasicMath {
|
||||
r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
|
||||
r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
|
||||
r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
|
||||
|
||||
r = a;
|
||||
/* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
|
||||
/* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
|
||||
/* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
|
||||
/* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -190,6 +197,12 @@ struct TestComplexBasicMath {
|
||||
d_results(17) = c-a;
|
||||
d_results(18) = c*a;
|
||||
d_results(19) = c/a;
|
||||
|
||||
int e = 2;
|
||||
d_results(20) = a+e;
|
||||
d_results(21) = a-e;
|
||||
d_results(22) = a*e;
|
||||
d_results(23) = a/e;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -286,7 +286,9 @@ struct TestMDRange_2D {
|
||||
// Test with reducers - scalar
|
||||
{
|
||||
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
|
||||
range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
|
||||
|
||||
TestMDRange_2D functor( N0, N1 );
|
||||
|
||||
@ -297,7 +299,7 @@ struct TestMDRange_2D {
|
||||
|
||||
parallel_reduce( range, functor, reducer_scalar );
|
||||
|
||||
ASSERT_EQ( sum, 2 * N0 * N1 );
|
||||
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
|
||||
}
|
||||
// Test with reducers - scalar view
|
||||
{
|
||||
@ -445,7 +447,9 @@ struct TestMDRange_2D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
|
||||
const int s0 = 1;
|
||||
const int s1 = 1;
|
||||
range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
|
||||
TestMDRange_2D functor( N0, N1 );
|
||||
|
||||
parallel_for( range, functor );
|
||||
@ -454,8 +458,8 @@ struct TestMDRange_2D {
|
||||
Kokkos::deep_copy( h_view, functor.input_view );
|
||||
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < N0; ++i )
|
||||
for ( int j = 0; j < N1; ++j )
|
||||
for ( int i = s0; i < N0; ++i )
|
||||
for ( int j = s1; j < N1; ++j )
|
||||
{
|
||||
if ( h_view( i, j ) != 3 ) {
|
||||
++counter;
|
||||
@ -463,7 +467,7 @@ struct TestMDRange_2D {
|
||||
}
|
||||
|
||||
if ( counter != 0 ) {
|
||||
printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
|
||||
printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
|
||||
}
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
@ -699,6 +703,7 @@ struct TestMDRange_2D {
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
}
|
||||
|
||||
} // end test_for2
|
||||
}; // MDRange_2D
|
||||
|
||||
@ -749,7 +754,10 @@ struct TestMDRange_3D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
|
||||
|
||||
TestMDRange_3D functor( N0, N1, N2 );
|
||||
|
||||
@ -757,7 +765,7 @@ struct TestMDRange_3D {
|
||||
double sum = 0.0;
|
||||
parallel_reduce( range, functor, sum );
|
||||
|
||||
ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
|
||||
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
|
||||
}
|
||||
|
||||
// Test with reducers - scalar
|
||||
@ -952,7 +960,10 @@ struct TestMDRange_3D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
|
||||
TestMDRange_3D functor( N0, N1, N2 );
|
||||
|
||||
parallel_for( range, functor );
|
||||
@ -961,9 +972,9 @@ struct TestMDRange_3D {
|
||||
Kokkos::deep_copy( h_view, functor.input_view );
|
||||
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < N0; ++i )
|
||||
for ( int j = 0; j < N1; ++j )
|
||||
for ( int k = 0; k < N2; ++k )
|
||||
for ( int i = s0; i < N0; ++i )
|
||||
for ( int j = s1; j < N1; ++j )
|
||||
for ( int k = s2; k < N2; ++k )
|
||||
{
|
||||
if ( h_view( i, j, k ) != 3 ) {
|
||||
++counter;
|
||||
@ -971,7 +982,7 @@ struct TestMDRange_3D {
|
||||
}
|
||||
|
||||
if ( counter != 0 ) {
|
||||
printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
|
||||
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
|
||||
}
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
|
||||
|
||||
TestMDRange_4D functor( N0, N1, N2, N3 );
|
||||
|
||||
@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
|
||||
double sum = 0.0;
|
||||
parallel_reduce( range, functor, sum );
|
||||
|
||||
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
|
||||
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
|
||||
}
|
||||
|
||||
// Test with reducers - scalar
|
||||
@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
|
||||
TestMDRange_4D functor( N0, N1, N2, N3 );
|
||||
|
||||
parallel_for( range, functor );
|
||||
@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
|
||||
Kokkos::deep_copy( h_view, functor.input_view );
|
||||
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < N0; ++i )
|
||||
for ( int j = 0; j < N1; ++j )
|
||||
for ( int k = 0; k < N2; ++k )
|
||||
for ( int l = 0; l < N3; ++l )
|
||||
for ( int i = s0; i < N0; ++i )
|
||||
for ( int j = s1; j < N1; ++j )
|
||||
for ( int k = s2; k < N2; ++k )
|
||||
for ( int l = s3; l < N3; ++l )
|
||||
{
|
||||
if ( h_view( i, j, k, l ) != 3 ) {
|
||||
++counter;
|
||||
@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
|
||||
}
|
||||
|
||||
if ( counter != 0 ) {
|
||||
printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
|
||||
printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
|
||||
}
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
int s4 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
|
||||
|
||||
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
|
||||
|
||||
@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
|
||||
double sum = 0.0;
|
||||
parallel_reduce( range, functor, sum );
|
||||
|
||||
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
|
||||
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
|
||||
}
|
||||
|
||||
// Test with reducers - scalar
|
||||
@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
int s4 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
|
||||
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
|
||||
|
||||
parallel_for( range, functor );
|
||||
@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
|
||||
Kokkos::deep_copy( h_view, functor.input_view );
|
||||
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < N0; ++i )
|
||||
for ( int j = 0; j < N1; ++j )
|
||||
for ( int k = 0; k < N2; ++k )
|
||||
for ( int l = 0; l < N3; ++l )
|
||||
for ( int m = 0; m < N4; ++m )
|
||||
for ( int i = s0; i < N0; ++i )
|
||||
for ( int j = s1; j < N1; ++j )
|
||||
for ( int k = s2; k < N2; ++k )
|
||||
for ( int l = s3; l < N3; ++l )
|
||||
for ( int m = s4; m < N4; ++m )
|
||||
{
|
||||
if ( h_view( i, j, k, l, m ) != 3 ) {
|
||||
++counter;
|
||||
@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
|
||||
}
|
||||
|
||||
if ( counter != 0 ) {
|
||||
printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
|
||||
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
|
||||
}
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
int s4 = 1;
|
||||
int s5 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
|
||||
|
||||
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
|
||||
|
||||
@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
|
||||
double sum = 0.0;
|
||||
parallel_reduce( range, functor, sum );
|
||||
|
||||
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
|
||||
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
|
||||
}
|
||||
|
||||
// Test with reducers - scalar
|
||||
@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
|
||||
typedef typename range_type::tile_type tile_type;
|
||||
typedef typename range_type::point_type point_type;
|
||||
|
||||
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
|
||||
int s0 = 1;
|
||||
int s1 = 1;
|
||||
int s2 = 1;
|
||||
int s3 = 1;
|
||||
int s4 = 1;
|
||||
int s5 = 1;
|
||||
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
|
||||
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
|
||||
|
||||
parallel_for( range, functor );
|
||||
@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
|
||||
Kokkos::deep_copy( h_view, functor.input_view );
|
||||
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < N0; ++i )
|
||||
for ( int j = 0; j < N1; ++j )
|
||||
for ( int k = 0; k < N2; ++k )
|
||||
for ( int l = 0; l < N3; ++l )
|
||||
for ( int m = 0; m < N4; ++m )
|
||||
for ( int n = 0; n < N5; ++n )
|
||||
for ( int i = s0; i < N0; ++i )
|
||||
for ( int j = s1; j < N1; ++j )
|
||||
for ( int k = s2; k < N2; ++k )
|
||||
for ( int l = s3; l < N3; ++l )
|
||||
for ( int m = s4; m < N4; ++m )
|
||||
for ( int n = s5; n < N5; ++n )
|
||||
{
|
||||
if ( h_view( i, j, k, l, m, n ) != 3 ) {
|
||||
++counter;
|
||||
@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
|
||||
}
|
||||
|
||||
if ( counter != 0 ) {
|
||||
printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
|
||||
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
|
||||
}
|
||||
|
||||
ASSERT_EQ( counter, 0 );
|
||||
|
||||
@ -159,13 +159,13 @@ if buildflag or pathflag:
|
||||
os.remove("includelink")
|
||||
if os.path.isfile("liblink") or os.path.islink("liblink"):
|
||||
os.remove("liblink")
|
||||
if os.path.isfile("filelink") or os.path.islink("filelink"):
|
||||
os.remove("filelink")
|
||||
if os.path.isfile("filelink.o") or os.path.islink("filelink.o"):
|
||||
os.remove("filelink.o")
|
||||
cmd = 'ln -s "%s/src" includelink' % lattedir
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
cmd = 'ln -s "%s" liblink' % lattedir
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir
|
||||
cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
|
||||
# copy Makefile.lammps.suffix to Makefile.lammps
|
||||
|
||||
@ -3,5 +3,5 @@
|
||||
# GNU Fortran settings
|
||||
|
||||
latte_SYSINC =
|
||||
latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas
|
||||
latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas
|
||||
latte_SYSPATH = -fopenmp
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# Intel ifort settings
|
||||
|
||||
latte_SYSINC =
|
||||
latte_SYSLIB = ../../lib/latte/filelink \
|
||||
latte_SYSLIB = ../../lib/latte/filelink.o \
|
||||
-llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
|
||||
-lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
|
||||
-openmp -O0
|
||||
|
||||
2
src/.gitignore
vendored
2
src/.gitignore
vendored
@ -414,6 +414,8 @@
|
||||
/fix_lambdah_calc.h
|
||||
/fix_langevin_eff.cpp
|
||||
/fix_langevin_eff.h
|
||||
/fix_latte.cpp
|
||||
/fix_latte.h
|
||||
/fix_lb_fluid.cpp
|
||||
/fix_lb_fluid.h
|
||||
/fix_lb_momentum.cpp
|
||||
|
||||
@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then
|
||||
depend KOKKOS
|
||||
fi
|
||||
|
||||
if (test $1 = "USER-DRUDE") then
|
||||
depend USER-OMP
|
||||
fi
|
||||
|
||||
if (test $1 = "USER-FEP") then
|
||||
depend USER-OMP
|
||||
fi
|
||||
|
||||
@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecAtomicKokkos_PackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecAtomicKokkos_PackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
|
||||
const size_t elements = 3;
|
||||
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_buf(i,0) = _x(j,0);
|
||||
_buf(i,1) = _x(j,1);
|
||||
_buf(i,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag,
|
||||
const int* const pbc)
|
||||
{
|
||||
// Check whether to always run forward communication on the host
|
||||
// Choose correct forward PackComm kernel
|
||||
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n*size_forward;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecAtomicKokkos_PackCommSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_x_array _xw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecAtomicKokkos_PackCommSelf(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0);
|
||||
_xw(i+_nfirst,1) = _x(j,1);
|
||||
_xw(i+_nfirst,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
|
||||
const int nfirst, const int &pbc_flag, const int* const pbc) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
return n*3;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecAtomicKokkos_UnpackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecAtomicKokkos_UnpackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_x(i+_first,0) = _buf(i,0);
|
||||
_x(i+_first,1) = _buf(i,1);
|
||||
_x(i+_first,2) = _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_xfloat_2d &buf ) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz,dvx,dvy,dvz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
if (!deform_vremap) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
|
||||
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
|
||||
dvz = pbc[2]*h_rate[2];
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
if (mask[i] & deform_groupbit) {
|
||||
buf[m++] = h_v(j,0) + dvx;
|
||||
buf[m++] = h_v(j,1) + dvy;
|
||||
buf[m++] = h_v(j,2) + dvz;
|
||||
} else {
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
h_v(i,0) = buf[m++];
|
||||
h_v(i,1) = buf[m++];
|
||||
h_v(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
sync(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
const int last = first + n;
|
||||
for (int i = first; i < last; i++) {
|
||||
buf[m++] = h_f(i,0);
|
||||
buf[m++] = h_f(i,1);
|
||||
buf[m++] = h_f(i,2);
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf)
|
||||
{
|
||||
if(n > 0) {
|
||||
sync(Host,F_MASK);
|
||||
modified(Host,F_MASK);
|
||||
}
|
||||
|
||||
int m = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int j = list[i];
|
||||
h_f(j,0) += buf[m++];
|
||||
h_f(j,1) += buf[m++];
|
||||
h_f(j,2) += buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG>
|
||||
struct AtomVecAtomicKokkos_PackBorder {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
|
||||
virtual ~AtomVecAtomicKokkos() {}
|
||||
void grow(int);
|
||||
void copy(int, int, int);
|
||||
int pack_comm(int, int *, double *, int, int *);
|
||||
int pack_comm_vel(int, int *, double *, int, int *);
|
||||
void unpack_comm(int, int, double *);
|
||||
void unpack_comm_vel(int, int, double *);
|
||||
int pack_reverse(int, int, double *);
|
||||
void unpack_reverse(int, int *, double *);
|
||||
int pack_border(int, int *, double *, int, int *);
|
||||
int pack_border_vel(int, int *, double *, int, int *);
|
||||
void unpack_border(int, int, double *);
|
||||
@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
|
||||
bigint memory_usage();
|
||||
|
||||
void grow_reset();
|
||||
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
void unpack_comm_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf);
|
||||
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
|
||||
DAT::tdual_xfloat_2d buf,int iswap,
|
||||
int pbc_flag, int *pbc, ExecutionSpace space);
|
||||
@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
|
||||
DAT::t_x_array d_x;
|
||||
DAT::t_v_array d_v;
|
||||
DAT::t_f_array d_f;
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
DAT::tdual_int_1d k_count;
|
||||
};
|
||||
|
||||
@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecBondKokkos_PackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecBondKokkos_PackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
|
||||
const size_t elements = 3;
|
||||
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_buf(i,0) = _x(j,0);
|
||||
_buf(i,1) = _x(j,1);
|
||||
_buf(i,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag,
|
||||
const int* const pbc)
|
||||
{
|
||||
// Check whether to always run forward communication on the host
|
||||
// Choose correct forward PackComm kernel
|
||||
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n*size_forward;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecBondKokkos_PackCommSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_x_array _xw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecBondKokkos_PackCommSelf(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0);
|
||||
_xw(i+_nfirst,1) = _x(j,1);
|
||||
_xw(i+_nfirst,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
|
||||
const int nfirst, const int &pbc_flag, const int* const pbc) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
return n*3;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecBondKokkos_UnpackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecBondKokkos_UnpackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_x(i+_first,0) = _buf(i,0);
|
||||
_x(i+_first,1) = _buf(i,1);
|
||||
_x(i+_first,2) = _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_xfloat_2d &buf ) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz,dvx,dvy,dvz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
if (!deform_vremap) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
|
||||
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
|
||||
dvz = pbc[2]*h_rate[2];
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
if (mask[i] & deform_groupbit) {
|
||||
buf[m++] = h_v(j,0) + dvx;
|
||||
buf[m++] = h_v(j,1) + dvy;
|
||||
buf[m++] = h_v(j,2) + dvz;
|
||||
} else {
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
h_v(i,0) = buf[m++];
|
||||
h_v(i,1) = buf[m++];
|
||||
h_v(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
sync(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
const int last = first + n;
|
||||
for (int i = first; i < last; i++) {
|
||||
buf[m++] = h_f(i,0);
|
||||
buf[m++] = h_f(i,1);
|
||||
buf[m++] = h_f(i,2);
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
modified(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int j = list[i];
|
||||
h_f(j,0) += buf[m++];
|
||||
h_f(j,1) += buf[m++];
|
||||
h_f(j,2) += buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG>
|
||||
struct AtomVecBondKokkos_PackBorder {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
|
||||
virtual ~AtomVecBondKokkos() {}
|
||||
void grow(int);
|
||||
void copy(int, int, int);
|
||||
int pack_comm(int, int *, double *, int, int *);
|
||||
int pack_comm_vel(int, int *, double *, int, int *);
|
||||
void unpack_comm(int, int, double *);
|
||||
void unpack_comm_vel(int, int, double *);
|
||||
int pack_reverse(int, int, double *);
|
||||
void unpack_reverse(int, int *, double *);
|
||||
int pack_border(int, int *, double *, int, int *);
|
||||
int pack_border_vel(int, int *, double *, int, int *);
|
||||
int pack_border_hybrid(int, int *, double *);
|
||||
@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
|
||||
bigint memory_usage();
|
||||
|
||||
void grow_reset();
|
||||
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
void unpack_comm_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf);
|
||||
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
|
||||
DAT::tdual_xfloat_2d buf,int iswap,
|
||||
int pbc_flag, int *pbc, ExecutionSpace space);
|
||||
@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
|
||||
DAT::t_x_array d_x;
|
||||
DAT::t_v_array d_v;
|
||||
DAT::t_f_array d_f;
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
DAT::t_tagint_1d d_molecule;
|
||||
DAT::t_int_2d d_nspecial;
|
||||
|
||||
@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm {
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag,
|
||||
const int* const pbc)
|
||||
{
|
||||
// Check whether to always run forward communication on the host
|
||||
// Choose correct forward PackComm kernel
|
||||
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n*size_forward;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecChargeKokkos_PackCommSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_x_array _xw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecChargeKokkos_PackCommSelf(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0);
|
||||
_xw(i+_nfirst,1) = _x(j,1);
|
||||
_xw(i+_nfirst,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Launch the PackCommSelf functor matching the runtime settings over n
// entries and return n*3.
// LMPHostType is used when commKK->forward_comm_on_host is set, otherwise
// LMPDeviceType; the integer template arguments mirror pbc_flag and
// domain->triclinic. X_MASK is both synced and marked modified on the
// chosen side before the launch.
int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
                                        const int & iswap,
                                        const int nfirst, const int &pbc_flag,
                                        const int* const pbc) {
  const bool shifted = (pbc_flag != 0);
  const bool tilted = (domain->triclinic != 0);

  if (commKK->forward_comm_on_host) {
    sync(Host,X_MASK);
    modified(Host,X_MASK);
    if (shifted && tilted) {
      AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (shifted) {
      AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (tilted) {
      AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else {
      AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    }
  } else {
    sync(Device,X_MASK);
    modified(Device,X_MASK);
    if (shifted && tilted) {
      AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (shifted) {
      AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (tilted) {
      AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else {
      AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    }
  }
  return n*3;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecChargeKokkos_UnpackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecChargeKokkos_UnpackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_x(i+_first,0) = _buf(i,0);
|
||||
_x(i+_first,1) = _buf(i,1);
|
||||
_x(i+_first,2) = _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_xfloat_2d &buf ) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz,dvx,dvy,dvz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
if (!deform_vremap) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
|
||||
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
|
||||
dvz = pbc[2]*h_rate[2];
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
if (mask[i] & deform_groupbit) {
|
||||
buf[m++] = h_v(j,0) + dvx;
|
||||
buf[m++] = h_v(j,1) + dvy;
|
||||
buf[m++] = h_v(j,2) + dvz;
|
||||
} else {
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
h_v(i,0) = buf[m++];
|
||||
h_v(i,1) = buf[m++];
|
||||
h_v(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
sync(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
const int last = first + n;
|
||||
for (int i = first; i < last; i++) {
|
||||
buf[m++] = h_f(i,0);
|
||||
buf[m++] = h_f(i,1);
|
||||
buf[m++] = h_f(i,2);
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
modified(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int j = list[i];
|
||||
h_f(j,0) += buf[m++];
|
||||
h_f(j,1) += buf[m++];
|
||||
h_f(j,2) += buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG>
|
||||
struct AtomVecChargeKokkos_PackBorder {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
|
||||
virtual ~AtomVecChargeKokkos() {}
|
||||
void grow(int);
|
||||
void copy(int, int, int);
|
||||
int pack_comm(int, int *, double *, int, int *);
|
||||
int pack_comm_vel(int, int *, double *, int, int *);
|
||||
void unpack_comm(int, int, double *);
|
||||
void unpack_comm_vel(int, int, double *);
|
||||
int pack_reverse(int, int, double *);
|
||||
void unpack_reverse(int, int *, double *);
|
||||
int pack_border(int, int *, double *, int, int *);
|
||||
int pack_border_vel(int, int *, double *, int, int *);
|
||||
int pack_border_hybrid(int, int *, double *);
|
||||
@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
|
||||
bigint memory_usage();
|
||||
|
||||
void grow_reset();
|
||||
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
void unpack_comm_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf);
|
||||
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
|
||||
DAT::tdual_xfloat_2d buf,int iswap,
|
||||
int pbc_flag, int *pbc, ExecutionSpace space);
|
||||
@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
|
||||
DAT::t_x_array d_x;
|
||||
DAT::t_v_array d_v;
|
||||
DAT::t_f_array d_f;
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
DAT::t_float_1d d_q;
|
||||
|
||||
|
||||
@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
|
||||
DAT::t_x_array d_x;
|
||||
DAT::t_v_array d_v;
|
||||
DAT::t_f_array d_f;
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
DAT::tdual_int_1d k_count;
|
||||
};
|
||||
|
||||
@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecFullKokkos_PackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecFullKokkos_PackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
const size_t maxsend = (buf.view<DeviceType>().dimension_0()
|
||||
*buf.view<DeviceType>().dimension_1())/3;
|
||||
const size_t elements = 3;
|
||||
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_buf(i,0) = _x(j,0);
|
||||
_buf(i,1) = _x(j,1);
|
||||
_buf(i,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag,
|
||||
const int* const pbc)
|
||||
{
|
||||
// Check whether to always run forward communication on the host
|
||||
// Choose correct forward PackComm kernel
|
||||
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecFullKokkos_PackComm<LMPHostType,1,1>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecFullKokkos_PackComm<LMPHostType,1,0>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecFullKokkos_PackComm<LMPHostType,0,1>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecFullKokkos_PackComm<LMPHostType,0,0>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,1>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,0>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,1>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,0>
|
||||
f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n*size_forward;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecFullKokkos_PackCommSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_x_array _xw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecFullKokkos_PackCommSelf(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),
|
||||
_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0);
|
||||
_xw(i+_nfirst,1) = _x(j,1);
|
||||
_xw(i+_nfirst,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Launch the PackCommSelf functor matching the runtime settings over n
// entries and return n*3.
// LMPHostType is used when commKK->forward_comm_on_host is set, otherwise
// LMPDeviceType; the integer template arguments mirror pbc_flag and
// domain->triclinic. X_MASK is both synced and marked modified on the
// chosen side before the launch.
int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
                                      const int & iswap,
                                      const int nfirst, const int &pbc_flag,
                                      const int* const pbc) {
  const bool shifted = (pbc_flag != 0);
  const bool tilted = (domain->triclinic != 0);

  if (commKK->forward_comm_on_host) {
    sync(Host,X_MASK);
    modified(Host,X_MASK);
    if (shifted && tilted) {
      AtomVecFullKokkos_PackCommSelf<LMPHostType,1,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (shifted) {
      AtomVecFullKokkos_PackCommSelf<LMPHostType,1,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (tilted) {
      AtomVecFullKokkos_PackCommSelf<LMPHostType,0,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else {
      AtomVecFullKokkos_PackCommSelf<LMPHostType,0,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    }
  } else {
    sync(Device,X_MASK);
    modified(Device,X_MASK);
    if (shifted && tilted) {
      AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (shifted) {
      AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else if (tilted) {
      AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,1>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    } else {
      AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,0>
        pack(atomKK->k_x,nfirst,list,iswap,
             domain->xprd,domain->yprd,domain->zprd,
             domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,pack);
    }
  }
  return n*3;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecFullKokkos_UnpackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecFullKokkos_UnpackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_x(i+_first,0) = _buf(i,0);
|
||||
_x(i+_first,1) = _buf(i,1);
|
||||
_x(i+_first,2) = _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_xfloat_2d &buf ) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz,dvx,dvy,dvz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
if (!deform_vremap) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
|
||||
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
|
||||
dvz = pbc[2]*h_rate[2];
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
if (mask[i] & deform_groupbit) {
|
||||
buf[m++] = h_v(j,0) + dvx;
|
||||
buf[m++] = h_v(j,1) + dvy;
|
||||
buf[m++] = h_v(j,2) + dvz;
|
||||
} else {
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
h_v(i,0) = buf[m++];
|
||||
h_v(i,1) = buf[m++];
|
||||
h_v(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
sync(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
const int last = first + n;
|
||||
for (int i = first; i < last; i++) {
|
||||
buf[m++] = h_f(i,0);
|
||||
buf[m++] = h_f(i,1);
|
||||
buf[m++] = h_f(i,2);
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
modified(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int j = list[i];
|
||||
h_f(j,0) += buf[m++];
|
||||
h_f(j,1) += buf[m++];
|
||||
h_f(j,2) += buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG>
|
||||
struct AtomVecFullKokkos_PackBorder {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
|
||||
virtual ~AtomVecFullKokkos() {}
|
||||
void grow(int);
|
||||
void copy(int, int, int);
|
||||
int pack_comm(int, int *, double *, int, int *);
|
||||
int pack_comm_vel(int, int *, double *, int, int *);
|
||||
void unpack_comm(int, int, double *);
|
||||
void unpack_comm_vel(int, int, double *);
|
||||
int pack_reverse(int, int, double *);
|
||||
void unpack_reverse(int, int *, double *);
|
||||
int pack_border(int, int *, double *, int, int *);
|
||||
int pack_border_vel(int, int *, double *, int, int *);
|
||||
int pack_border_hybrid(int, int *, double *);
|
||||
@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
|
||||
bigint memory_usage();
|
||||
|
||||
void grow_reset();
|
||||
int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
void unpack_comm_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf);
|
||||
int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst,
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
|
||||
DAT::tdual_xfloat_2d buf,int iswap,
|
||||
int pbc_flag, int *pbc, ExecutionSpace space);
|
||||
@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
|
||||
DAT::t_x_array d_x;
|
||||
DAT::t_v_array d_v;
|
||||
DAT::t_f_array d_f;
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
DAT::t_float_1d d_q;
|
||||
HAT::t_float_1d h_q;
|
||||
|
||||
@ -12,6 +12,10 @@
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "atom_vec_kokkos.h"
|
||||
#include "atom_kokkos.h"
|
||||
#include "comm_kokkos.h"
|
||||
#include "domain.h"
|
||||
#include "atom_masks.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp)
|
||||
buffer_size = 0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecKokkos_PackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecKokkos_PackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
|
||||
const size_t elements = 3;
|
||||
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_buf(i,0) = _x(j,0);
|
||||
_buf(i,1) = _x(j,1);
|
||||
_buf(i,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_comm_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag,
|
||||
const int* const pbc)
|
||||
{
|
||||
// Check whether to always run forward communication on the host
|
||||
// Choose correct forward PackComm kernel
|
||||
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
if(pbc_flag) {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
} else {
|
||||
if(domain->triclinic) {
|
||||
struct AtomVecKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
struct AtomVecKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
|
||||
domain->xprd,domain->yprd,domain->zprd,
|
||||
domain->xy,domain->xz,domain->yz,pbc);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n*size_forward;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType,int PBC_FLAG,int TRICLINIC>
|
||||
struct AtomVecKokkos_PackCommSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
|
||||
typename ArrayTypes<DeviceType>::t_x_array _xw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
|
||||
X_FLOAT _pbc[6];
|
||||
|
||||
AtomVecKokkos_PackCommSelf(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
|
||||
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
|
||||
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
|
||||
_xprd(xprd),_yprd(yprd),_zprd(zprd),
|
||||
_xy(xy),_xz(xz),_yz(yz) {
|
||||
_pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
|
||||
_pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
if (PBC_FLAG == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0);
|
||||
_xw(i+_nfirst,1) = _x(j,1);
|
||||
_xw(i+_nfirst,2) = _x(j,2);
|
||||
} else {
|
||||
if (TRICLINIC == 0) {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
} else {
|
||||
_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
|
||||
_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
|
||||
_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
                                  const int nfirst, const int &pbc_flag, const int* const pbc) {
  // Self-swap forward communication: positions are copied within the
  // position array itself. commKK->forward_comm_on_host selects the
  // execution space; X is synced there and marked modified before the
  // kernel specialised for (pbc_flag, triclinic) is launched.
  // Returns n*3.

  if (commKK->forward_comm_on_host) {
    sync(Host,X_MASK);
    modified(Host,X_MASK);
    if (pbc_flag && domain->triclinic) {
      AtomVecKokkos_PackCommSelf<LMPHostType,1,1> copy(atomKK->k_x,nfirst,list,iswap,
                                                       domain->xprd,domain->yprd,domain->zprd,
                                                       domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else if (pbc_flag) {
      AtomVecKokkos_PackCommSelf<LMPHostType,1,0> copy(atomKK->k_x,nfirst,list,iswap,
                                                       domain->xprd,domain->yprd,domain->zprd,
                                                       domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else if (domain->triclinic) {
      AtomVecKokkos_PackCommSelf<LMPHostType,0,1> copy(atomKK->k_x,nfirst,list,iswap,
                                                       domain->xprd,domain->yprd,domain->zprd,
                                                       domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else {
      AtomVecKokkos_PackCommSelf<LMPHostType,0,0> copy(atomKK->k_x,nfirst,list,iswap,
                                                       domain->xprd,domain->yprd,domain->zprd,
                                                       domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    }
  } else {
    sync(Device,X_MASK);
    modified(Device,X_MASK);
    if (pbc_flag && domain->triclinic) {
      AtomVecKokkos_PackCommSelf<LMPDeviceType,1,1> copy(atomKK->k_x,nfirst,list,iswap,
                                                         domain->xprd,domain->yprd,domain->zprd,
                                                         domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else if (pbc_flag) {
      AtomVecKokkos_PackCommSelf<LMPDeviceType,1,0> copy(atomKK->k_x,nfirst,list,iswap,
                                                         domain->xprd,domain->yprd,domain->zprd,
                                                         domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else if (domain->triclinic) {
      AtomVecKokkos_PackCommSelf<LMPDeviceType,0,1> copy(atomKK->k_x,nfirst,list,iswap,
                                                         domain->xprd,domain->yprd,domain->zprd,
                                                         domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    } else {
      AtomVecKokkos_PackCommSelf<LMPDeviceType,0,0> copy(atomKK->k_x,nfirst,list,iswap,
                                                         domain->xprd,domain->yprd,domain->zprd,
                                                         domain->xy,domain->xz,domain->yz,pbc);
      Kokkos::parallel_for(n,copy);
    }
  }

  return n*3;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecKokkos_UnpackComm {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_x_array _x;
|
||||
typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecKokkos_UnpackComm(
|
||||
const typename DAT::tdual_x_array &x,
|
||||
const typename DAT::tdual_xfloat_2d &buf,
|
||||
const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_x(i+_first,0) = _buf(i,0);
|
||||
_x(i+_first,1) = _buf(i,1);
|
||||
_x(i+_first,2) = _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_xfloat_2d &buf ) {
|
||||
if(commKK->forward_comm_on_host) {
|
||||
sync(Host,X_MASK);
|
||||
modified(Host,X_MASK);
|
||||
struct AtomVecKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,X_MASK);
|
||||
modified(Device,X_MASK);
|
||||
struct AtomVecKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_comm(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf,
|
||||
int pbc_flag, int *pbc)
|
||||
{
|
||||
int i,j,m;
|
||||
double dx,dy,dz,dvx,dvy,dvz;
|
||||
|
||||
m = 0;
|
||||
if (pbc_flag == 0) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0);
|
||||
buf[m++] = h_x(j,1);
|
||||
buf[m++] = h_x(j,2);
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
if (domain->triclinic == 0) {
|
||||
dx = pbc[0]*domain->xprd;
|
||||
dy = pbc[1]*domain->yprd;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
} else {
|
||||
dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
|
||||
dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
|
||||
dz = pbc[2]*domain->zprd;
|
||||
}
|
||||
if (!deform_vremap) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
} else {
|
||||
dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
|
||||
dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
|
||||
dvz = pbc[2]*h_rate[2];
|
||||
for (i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf[m++] = h_x(j,0) + dx;
|
||||
buf[m++] = h_x(j,1) + dy;
|
||||
buf[m++] = h_x(j,2) + dz;
|
||||
if (atom->mask[i] & deform_groupbit) {
|
||||
buf[m++] = h_v(j,0) + dvx;
|
||||
buf[m++] = h_v(j,1) + dvy;
|
||||
buf[m++] = h_v(j,2) + dvz;
|
||||
} else {
|
||||
buf[m++] = h_v(j,0);
|
||||
buf[m++] = h_v(j,1);
|
||||
buf[m++] = h_v(j,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecKokkos::unpack_comm(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf)
|
||||
{
|
||||
int i,m,last;
|
||||
|
||||
m = 0;
|
||||
last = first + n;
|
||||
for (i = first; i < last; i++) {
|
||||
h_x(i,0) = buf[m++];
|
||||
h_x(i,1) = buf[m++];
|
||||
h_x(i,2) = buf[m++];
|
||||
h_v(i,0) = buf[m++];
|
||||
h_v(i,1) = buf[m++];
|
||||
h_v(i,2) = buf[m++];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecKokkos_PackReverse {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
|
||||
typename ArrayTypes<DeviceType>::t_ffloat_2d _buf;
|
||||
int _first;
|
||||
|
||||
AtomVecKokkos_PackReverse(
|
||||
const typename DAT::tdual_f_array &f,
|
||||
const typename DAT::tdual_ffloat_2d &buf,
|
||||
const int& first):_f(f.view<DeviceType>()),_buf(buf.view<DeviceType>()),
|
||||
_first(first) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
_buf(i,0) = _f(i+_first,0);
|
||||
_buf(i,1) = _f(i+_first,1);
|
||||
_buf(i,2) = _f(i+_first,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first,
|
||||
const DAT::tdual_ffloat_2d &buf ) {
|
||||
if(commKK->reverse_comm_on_host) {
|
||||
sync(Host,F_MASK);
|
||||
struct AtomVecKokkos_PackReverse<LMPHostType> f(atomKK->k_f,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
} else {
|
||||
sync(Device,F_MASK);
|
||||
struct AtomVecKokkos_PackReverse<LMPDeviceType> f(atomKK->k_f,buf,first);
|
||||
Kokkos::parallel_for(n,f);
|
||||
}
|
||||
|
||||
return n*size_reverse;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecKokkos_UnPackReverseSelf {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
|
||||
typename ArrayTypes<DeviceType>::t_f_array _fw;
|
||||
int _nfirst;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
|
||||
AtomVecKokkos_UnPackReverseSelf(
|
||||
const typename DAT::tdual_f_array &f,
|
||||
const int &nfirst,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap):
|
||||
_f(f.view<DeviceType>()),_fw(f.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap) {
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
_fw(j,0) += _f(i+_nfirst,0);
|
||||
_fw(j,1) += _f(i+_nfirst,1);
|
||||
_fw(j,2) += _f(i+_nfirst,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
                                       const int nfirst) {
  // Self-swap reverse communication: forces are accumulated within the
  // force array itself. commKK->reverse_comm_on_host selects the execution
  // space; F is synced there before the kernel and marked modified after.
  // Returns n*3.
  if (commKK->reverse_comm_on_host) {
    sync(Host,F_MASK);
    AtomVecKokkos_UnPackReverseSelf<LMPHostType> accumulate(atomKK->k_f,nfirst,list,iswap);
    Kokkos::parallel_for(n,accumulate);
    modified(Host,F_MASK);
  } else {
    sync(Device,F_MASK);
    AtomVecKokkos_UnPackReverseSelf<LMPDeviceType> accumulate(atomKK->k_f,nfirst,list,iswap);
    Kokkos::parallel_for(n,accumulate);
    modified(Device,F_MASK);
  }

  return n*3;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct AtomVecKokkos_UnPackReverse {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
typename ArrayTypes<DeviceType>::t_f_array _f;
|
||||
typename ArrayTypes<DeviceType>::t_ffloat_2d_const _buf;
|
||||
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
|
||||
const int _iswap;
|
||||
|
||||
AtomVecKokkos_UnPackReverse(
|
||||
const typename DAT::tdual_f_array &f,
|
||||
const typename DAT::tdual_ffloat_2d &buf,
|
||||
const typename DAT::tdual_int_2d &list,
|
||||
const int & iswap):
|
||||
_f(f.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap) {
|
||||
const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
|
||||
const size_t elements = 3;
|
||||
buffer_view<DeviceType>(_buf,buf,maxsend,elements);
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
const int j = _list(_iswap,i);
|
||||
_f(j,0) += _buf(i,0);
|
||||
_f(j,1) += _buf(i,1);
|
||||
_f(j,2) += _buf(i,2);
|
||||
}
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecKokkos::unpack_reverse_kokkos(const int &n,
|
||||
const DAT::tdual_int_2d &list,
|
||||
const int & iswap,
|
||||
const DAT::tdual_ffloat_2d &buf)
|
||||
{
|
||||
// Check whether to always run reverse communication on the host
|
||||
// Choose correct reverse UnPackReverse kernel
|
||||
|
||||
if(commKK->reverse_comm_on_host) {
|
||||
struct AtomVecKokkos_UnPackReverse<LMPHostType> f(atomKK->k_f,buf,list,iswap);
|
||||
Kokkos::parallel_for(n,f);
|
||||
modified(Host,F_MASK);
|
||||
} else {
|
||||
struct AtomVecKokkos_UnPackReverse<LMPDeviceType> f(atomKK->k_f,buf,list,iswap);
|
||||
Kokkos::parallel_for(n,f);
|
||||
modified(Device,F_MASK);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
int AtomVecKokkos::pack_reverse(int n, int first, double *buf)
|
||||
{
|
||||
if(n > 0)
|
||||
sync(Host,F_MASK);
|
||||
|
||||
int m = 0;
|
||||
const int last = first + n;
|
||||
for (int i = first; i < last; i++) {
|
||||
buf[m++] = h_f(i,0);
|
||||
buf[m++] = h_f(i,1);
|
||||
buf[m++] = h_f(i,2);
|
||||
}
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
|
||||
{
|
||||
int m = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int j = list[i];
|
||||
h_f(j,0) += buf[m++];
|
||||
h_f(j,1) += buf[m++];
|
||||
h_f(j,2) += buf[m++];
|
||||
}
|
||||
|
||||
if(n > 0)
|
||||
modified(Host,F_MASK);
|
||||
}
|
||||
|
||||
@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec {
|
||||
public:
|
||||
AtomVecKokkos(class LAMMPS *);
|
||||
virtual ~AtomVecKokkos() {}
|
||||
virtual int pack_comm(int, int *, double *, int, int *);
|
||||
virtual int pack_comm_vel(int, int *, double *, int, int *);
|
||||
virtual void unpack_comm(int, int, double *);
|
||||
virtual void unpack_comm_vel(int, int, double *);
|
||||
virtual int pack_reverse(int, int, double *);
|
||||
virtual void unpack_reverse(int, int *, double *);
|
||||
|
||||
virtual void sync(ExecutionSpace space, unsigned int mask) = 0;
|
||||
virtual void modified(ExecutionSpace space, unsigned int mask) = 0;
|
||||
virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {};
|
||||
virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0;
|
||||
|
||||
virtual int
|
||||
pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst,
|
||||
const int &pbc_flag, const int pbc[]) = 0;
|
||||
//{return 0;}
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
|
||||
virtual int
|
||||
pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const DAT::tdual_xfloat_2d &buf,
|
||||
const int &pbc_flag, const int pbc[]) = 0;
|
||||
//{return 0;}
|
||||
const int &pbc_flag, const int pbc[]);
|
||||
|
||||
virtual void
|
||||
unpack_comm_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf) = 0;
|
||||
const DAT::tdual_xfloat_2d &buf);
|
||||
|
||||
virtual int
|
||||
unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const int nfirst);
|
||||
|
||||
virtual int
|
||||
pack_reverse_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_ffloat_2d &buf);
|
||||
|
||||
virtual void
|
||||
unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list,
|
||||
const int & iswap, const DAT::tdual_ffloat_2d &buf);
|
||||
|
||||
virtual int
|
||||
pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
|
||||
DAT::tdual_xfloat_2d buf,int iswap,
|
||||
int pbc_flag, int *pbc, ExecutionSpace space) = 0;
|
||||
//{return 0;};
|
||||
|
||||
virtual void
|
||||
unpack_border_kokkos(const int &n, const int &nfirst,
|
||||
const DAT::tdual_xfloat_2d &buf,
|
||||
@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec {
|
||||
DAT::tdual_int_1d k_sendlist,
|
||||
DAT::tdual_int_1d k_copylist,
|
||||
ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0;
|
||||
//{return 0;};
|
||||
|
||||
virtual int
|
||||
unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
|
||||
int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
|
||||
ExecutionSpace space) = 0;
|
||||
//{return 0;};
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
HAT::t_x_array h_x;
|
||||
HAT::t_v_array h_v;
|
||||
HAT::t_f_array h_f;
|
||||
|
||||
class CommKokkos *commKK;
|
||||
size_t buffer_size;
|
||||
void* buffer;
|
||||
|
||||
@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
|
||||
if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
|
||||
memory->sfree(sendlist);
|
||||
sendlist = NULL;
|
||||
k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
|
||||
k_sendlist = DAT::tdual_int_2d();
|
||||
k_total_send = DAT::tdual_int_scalar("comm::k_total_send");
|
||||
|
||||
// error check for disallow of OpenMP threads?
|
||||
|
||||
@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
|
||||
memory->destroy(buf_recv);
|
||||
buf_recv = NULL;
|
||||
|
||||
k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
|
||||
k_exchange_sendlist = DAT::
|
||||
tdual_int_1d("comm:k_exchange_sendlist",100);
|
||||
k_exchange_copylist = ArrayTypes<LMPDeviceType>::
|
||||
k_exchange_copylist = DAT::
|
||||
tdual_int_1d("comm:k_exchange_copylist",100);
|
||||
k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
|
||||
k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
|
||||
k_count = DAT::tdual_int_scalar("comm:k_count");
|
||||
k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
|
||||
|
||||
memory->destroy(maxsendlist);
|
||||
maxsendlist = NULL;
|
||||
@ -102,8 +103,10 @@ void CommKokkos::init()
|
||||
atomKK = (AtomKokkos *) atom;
|
||||
exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
|
||||
forward_comm_classic = lmp->kokkos->forward_comm_classic;
|
||||
reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
|
||||
exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
|
||||
forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
|
||||
reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;
|
||||
|
||||
CommBrick::init();
|
||||
|
||||
@ -132,8 +135,11 @@ void CommKokkos::init()
|
||||
if (force->newton == 0) check_reverse = 0;
|
||||
if (force->pair) check_reverse += force->pair->comm_reverse_off;
|
||||
|
||||
if(check_reverse || check_forward)
|
||||
if (ghost_velocity)
|
||||
forward_comm_classic = true;
|
||||
|
||||
if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
|
||||
reverse_comm_classic = true;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy)
|
||||
int n;
|
||||
MPI_Request request;
|
||||
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
|
||||
double **x = atom->x;
|
||||
double *buf;
|
||||
|
||||
// exchange data with another proc
|
||||
@ -181,32 +186,29 @@ void CommKokkos::forward_comm_device(int dummy)
|
||||
// if comm_x_only set, exchange or copy directly to x, don't unpack
|
||||
|
||||
k_sendlist.sync<DeviceType>();
|
||||
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
|
||||
|
||||
for (int iswap = 0; iswap < nswap; iswap++) {
|
||||
|
||||
if (sendproc[iswap] != me) {
|
||||
if (comm_x_only) {
|
||||
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
|
||||
if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
|
||||
else buf = NULL;
|
||||
|
||||
if (size_forward_recv[iswap]) {
|
||||
buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
|
||||
firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
|
||||
MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
|
||||
recvproc[iswap],0,world,&request);
|
||||
recvproc[iswap],0,world,&request);
|
||||
}
|
||||
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
|
||||
iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
|
||||
|
||||
if (n) {
|
||||
MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
|
||||
n,MPI_DOUBLE,sendproc[iswap],0,world);
|
||||
}
|
||||
|
||||
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
|
||||
space,X_MASK);
|
||||
if (size_forward_recv[iswap]) {
|
||||
MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
|
||||
space,X_MASK);
|
||||
}
|
||||
} else if (ghost_velocity) {
|
||||
error->all(FLERR,"Ghost velocity forward comm not yet "
|
||||
"implemented with Kokkos");
|
||||
@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
reverse communication of forces on atoms every timestep
|
||||
other per-atom attributes may also be sent via pack/unpack routines
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void CommKokkos::reverse_comm()
|
||||
{
|
||||
if (!reverse_comm_classic) {
|
||||
if (reverse_comm_on_host) reverse_comm_device<LMPHostType>();
|
||||
else reverse_comm_device<LMPDeviceType>();
|
||||
return;
|
||||
}
|
||||
|
||||
k_sendlist.sync<LMPHostType>();
|
||||
|
||||
if (comm_f_only)
|
||||
atomKK->sync(Host,F_MASK);
|
||||
else
|
||||
atomKK->sync(Host,ALL_MASK);
|
||||
|
||||
CommBrick::reverse_comm();
|
||||
|
||||
if (comm_f_only)
|
||||
atomKK->modified(Host,F_MASK);
|
||||
else
|
||||
atomKK->modified(Host,ALL_MASK);
|
||||
atomKK->sync(Device,ALL_MASK);
|
||||
|
||||
//atomKK->sync(Device,ALL_MASK); // is this needed?
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
void CommKokkos::reverse_comm_device()
|
||||
{
|
||||
int n;
|
||||
MPI_Request request;
|
||||
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
|
||||
double *buf;
|
||||
|
||||
// exchange data with another proc
|
||||
// if other proc is self, just copy
|
||||
// if comm_f_only set, exchange or copy directly from f, don't pack
|
||||
|
||||
k_sendlist.sync<DeviceType>();
|
||||
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
|
||||
|
||||
for (int iswap = nswap-1; iswap >= 0; iswap--) {
|
||||
if (sendproc[iswap] != me) {
|
||||
if (comm_f_only) {
|
||||
if (size_reverse_recv[iswap])
|
||||
MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
|
||||
sendproc[iswap],0,world,&request);
|
||||
if (size_reverse_send[iswap]) {
|
||||
buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
|
||||
firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1();
|
||||
|
||||
MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
|
||||
recvproc[iswap],0,world);
|
||||
}
|
||||
if (size_reverse_recv[iswap]) {
|
||||
MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
|
||||
space,F_MASK);
|
||||
}
|
||||
} else {
|
||||
if (size_reverse_recv[iswap])
|
||||
MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
|
||||
size_reverse_recv[iswap],MPI_DOUBLE,
|
||||
sendproc[iswap],0,world,&request);
|
||||
n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
|
||||
if (n)
|
||||
MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
|
||||
MPI_DOUBLE,recvproc[iswap],0,world);
|
||||
if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
}
|
||||
avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,
|
||||
k_buf_recv);
|
||||
} else {
|
||||
if (sendnum[iswap])
|
||||
n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
|
||||
firstrecv[iswap]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void CommKokkos::forward_comm_fix(Fix *fix, int size)
|
||||
{
|
||||
k_sendlist.sync<LMPHostType>();
|
||||
@ -408,7 +482,7 @@ struct BuildExchangeListFunctor {
|
||||
typename AT::t_x_array _x;
|
||||
|
||||
int _nlocal,_dim;
|
||||
typename AT::t_int_1d _nsend;
|
||||
typename AT::t_int_scalar _nsend;
|
||||
typename AT::t_int_1d _sendlist;
|
||||
typename AT::t_int_1d _sendflag;
|
||||
|
||||
@ -416,7 +490,7 @@ struct BuildExchangeListFunctor {
|
||||
BuildExchangeListFunctor(
|
||||
const typename AT::tdual_x_array x,
|
||||
const typename AT::tdual_int_1d sendlist,
|
||||
typename AT::tdual_int_1d nsend,
|
||||
typename AT::tdual_int_scalar nsend,
|
||||
typename AT::tdual_int_1d sendflag,int nlocal, int dim,
|
||||
X_FLOAT lo, X_FLOAT hi):
|
||||
_x(x.template view<DeviceType>()),
|
||||
@ -430,7 +504,7 @@ struct BuildExchangeListFunctor {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (int i) const {
|
||||
if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
|
||||
const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1);
|
||||
const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
|
||||
if(mysend<_sendlist.dimension_0()) {
|
||||
_sendlist(mysend) = i;
|
||||
_sendflag(i) = 1;
|
||||
@ -489,9 +563,9 @@ void CommKokkos::exchange_device()
|
||||
if (true) {
|
||||
if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal);
|
||||
k_sendflag.sync<DeviceType>();
|
||||
k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
|
||||
while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
|
||||
k_count.h_view(0) = 0;
|
||||
k_count.h_view() = k_exchange_sendlist.h_view.dimension_0();
|
||||
while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
|
||||
k_count.h_view() = 0;
|
||||
k_count.modify<LMPHostType>();
|
||||
k_count.sync<DeviceType>();
|
||||
|
||||
@ -504,10 +578,10 @@ void CommKokkos::exchange_device()
|
||||
k_count.modify<DeviceType>();
|
||||
|
||||
k_count.sync<LMPHostType>();
|
||||
if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
|
||||
k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
|
||||
k_exchange_copylist.resize(k_count.h_view(0)*1.1);
|
||||
k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0();
|
||||
if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
|
||||
k_exchange_sendlist.resize(k_count.h_view()*1.1);
|
||||
k_exchange_copylist.resize(k_count.h_view()*1.1);
|
||||
k_count.h_view()=k_exchange_sendlist.h_view.dimension_0();
|
||||
}
|
||||
}
|
||||
k_exchange_copylist.sync<LMPHostType>();
|
||||
@ -515,22 +589,22 @@ void CommKokkos::exchange_device()
|
||||
k_sendflag.sync<LMPHostType>();
|
||||
|
||||
int sendpos = nlocal-1;
|
||||
nlocal -= k_count.h_view(0);
|
||||
for(int i = 0; i < k_count.h_view(0); i++) {
|
||||
nlocal -= k_count.h_view();
|
||||
for(int i = 0; i < k_count.h_view(); i++) {
|
||||
if (k_exchange_sendlist.h_view(i)<nlocal) {
|
||||
while (k_sendflag.h_view(sendpos)) sendpos--;
|
||||
k_exchange_copylist.h_view(i) = sendpos;
|
||||
sendpos--;
|
||||
} else
|
||||
k_exchange_copylist.h_view(i) = -1;
|
||||
k_exchange_copylist.h_view(i) = -1;
|
||||
}
|
||||
|
||||
k_exchange_copylist.modify<LMPHostType>();
|
||||
k_exchange_copylist.sync<DeviceType>();
|
||||
nsend = k_count.h_view(0);
|
||||
nsend = k_count.h_view();
|
||||
if (nsend > maxsend) grow_send_kokkos(nsend,1);
|
||||
nsend =
|
||||
avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
|
||||
avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
|
||||
k_exchange_sendlist,k_exchange_copylist,
|
||||
ExecutionSpaceFromDevice<DeviceType>::
|
||||
space,dim,lo,hi);
|
||||
@ -640,9 +714,7 @@ void CommKokkos::borders()
|
||||
}
|
||||
|
||||
atomKK->sync(Host,ALL_MASK);
|
||||
atomKK->modified(Host,ALL_MASK);
|
||||
k_sendlist.sync<LMPHostType>();
|
||||
k_sendlist.modify<LMPHostType>();
|
||||
CommBrick::borders();
|
||||
k_sendlist.modify<LMPHostType>();
|
||||
atomKK->modified(Host,ALL_MASK);
|
||||
@ -659,11 +731,11 @@ struct BuildBorderListFunctor {
|
||||
int iswap,maxsendlist;
|
||||
int nfirst,nlast,dim;
|
||||
typename AT::t_int_2d sendlist;
|
||||
typename AT::t_int_1d nsend;
|
||||
typename AT::t_int_scalar nsend;
|
||||
|
||||
BuildBorderListFunctor(typename AT::tdual_x_array _x,
|
||||
typename AT::tdual_int_2d _sendlist,
|
||||
typename AT::tdual_int_1d _nsend,int _nfirst,
|
||||
typename AT::tdual_int_scalar _nsend,int _nfirst,
|
||||
int _nlast, int _dim,
|
||||
X_FLOAT _lo, X_FLOAT _hi, int _iswap,
|
||||
int _maxsendlist):
|
||||
@ -684,7 +756,7 @@ struct BuildBorderListFunctor {
|
||||
for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) {
|
||||
if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++;
|
||||
}
|
||||
const int my_store_pos = dev.team_scan(mysend,&nsend(0));
|
||||
const int my_store_pos = dev.team_scan(mysend,&nsend());
|
||||
|
||||
if (my_store_pos+mysend < maxsendlist) {
|
||||
mysend = my_store_pos;
|
||||
@ -713,7 +785,7 @@ void CommKokkos::borders_device() {
|
||||
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
|
||||
|
||||
ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
|
||||
k_sendlist.modify<DeviceType>();
|
||||
k_sendlist.sync<DeviceType>();
|
||||
atomKK->sync(exec_space,ALL_MASK);
|
||||
|
||||
// do swaps over all 3 dimensions
|
||||
@ -763,37 +835,38 @@ void CommKokkos::borders_device() {
|
||||
if (sendflag) {
|
||||
if (!bordergroup || ineed >= 2) {
|
||||
if (style == SINGLE) {
|
||||
typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
|
||||
total_send.h_view(0) = 0;
|
||||
if(exec_space == Device) {
|
||||
total_send.template modify<DeviceType>();
|
||||
total_send.template sync<LMPDeviceType>();
|
||||
}
|
||||
k_total_send.h_view() = 0;
|
||||
k_total_send.template modify<LMPHostType>();
|
||||
k_total_send.template sync<LMPDeviceType>();
|
||||
|
||||
BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
|
||||
total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
|
||||
k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
|
||||
Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
|
||||
Kokkos::parallel_for(config,f);
|
||||
|
||||
total_send.template modify<DeviceType>();
|
||||
total_send.template sync<LMPHostType>();
|
||||
k_total_send.template modify<DeviceType>();
|
||||
k_total_send.template sync<LMPHostType>();
|
||||
|
||||
k_sendlist.modify<DeviceType>();
|
||||
|
||||
if(k_total_send.h_view() >= maxsendlist[iswap]) {
|
||||
grow_list(iswap,k_total_send.h_view());
|
||||
|
||||
k_total_send.h_view() = 0;
|
||||
k_total_send.template modify<LMPHostType>();
|
||||
k_total_send.template sync<LMPDeviceType>();
|
||||
|
||||
if(total_send.h_view(0) >= maxsendlist[iswap]) {
|
||||
grow_list(iswap,total_send.h_view(0));
|
||||
k_sendlist.modify<DeviceType>();
|
||||
total_send.h_view(0) = 0;
|
||||
if(exec_space == Device) {
|
||||
total_send.template modify<LMPHostType>();
|
||||
total_send.template sync<LMPDeviceType>();
|
||||
}
|
||||
BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
|
||||
total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
|
||||
k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
|
||||
Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
|
||||
Kokkos::parallel_for(config,f);
|
||||
total_send.template modify<DeviceType>();
|
||||
total_send.template sync<LMPHostType>();
|
||||
|
||||
k_total_send.template modify<DeviceType>();
|
||||
k_total_send.template sync<LMPHostType>();
|
||||
|
||||
k_sendlist.modify<DeviceType>();
|
||||
}
|
||||
nsend = total_send.h_view(0);
|
||||
nsend = k_total_send.h_view();
|
||||
} else {
|
||||
error->all(FLERR,"Required border comm not yet "
|
||||
"implemented with Kokkos");
|
||||
@ -916,10 +989,11 @@ void CommKokkos::borders_device() {
|
||||
|
||||
// reset global->local map
|
||||
|
||||
if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
|
||||
atomKK->modified(exec_space,ALL_MASK);
|
||||
atomKK->sync(Host,TAG_MASK);
|
||||
if (map_style) atom->map_set();
|
||||
if (map_style) {
|
||||
atomKK->sync(Host,TAG_MASK);
|
||||
atom->map_set();
|
||||
}
|
||||
}
|
||||
/* ----------------------------------------------------------------------
|
||||
realloc the size of the send buffer as needed with BUFFACTOR and bufextra
|
||||
@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
|
||||
buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
|
||||
}
|
||||
else {
|
||||
k_buf_send = ArrayTypes<LMPDeviceType>::
|
||||
k_buf_send = DAT::
|
||||
tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border);
|
||||
buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
|
||||
}
|
||||
@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space)
|
||||
{
|
||||
maxrecv = static_cast<int> (BUFFACTOR * n);
|
||||
int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2;
|
||||
k_buf_recv = ArrayTypes<LMPDeviceType>::
|
||||
k_buf_recv = DAT::
|
||||
tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border);
|
||||
buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
|
||||
}
|
||||
@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n)
|
||||
{
|
||||
int size = static_cast<int> (BUFFACTOR * n);
|
||||
|
||||
if (exchange_comm_classic) { // force realloc on Host
|
||||
k_sendlist.sync<LMPHostType>();
|
||||
k_sendlist.modify<LMPHostType>();
|
||||
}
|
||||
|
||||
memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
|
||||
|
||||
for(int i=0;i<maxswap;i++) {
|
||||
@ -1011,6 +1090,11 @@ void CommKokkos::grow_swap(int n)
|
||||
maxswap = n;
|
||||
int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);
|
||||
|
||||
if (exchange_comm_classic) { // force realloc on Host
|
||||
k_sendlist.sync<LMPHostType>();
|
||||
k_sendlist.modify<LMPHostType>();
|
||||
}
|
||||
|
||||
memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
|
||||
|
||||
memory->grow(maxsendlist,n,"comm:maxsendlist");
|
||||
|
||||
@ -25,15 +25,17 @@ class CommKokkos : public CommBrick {
|
||||
|
||||
bool exchange_comm_classic;
|
||||
bool forward_comm_classic;
|
||||
bool reverse_comm_classic;
|
||||
bool exchange_comm_on_host;
|
||||
bool forward_comm_on_host;
|
||||
bool reverse_comm_on_host;
|
||||
|
||||
CommKokkos(class LAMMPS *);
|
||||
~CommKokkos();
|
||||
void init();
|
||||
|
||||
void forward_comm(int dummy = 0); // forward comm of atom coords
|
||||
void reverse_comm(); // reverse comm of atom coords
|
||||
void reverse_comm(); // reverse comm of atom coords
|
||||
void exchange(); // move atoms to new procs
|
||||
void borders(); // setup list of atoms to comm
|
||||
|
||||
@ -47,15 +49,17 @@ class CommKokkos : public CommBrick {
|
||||
void reverse_comm_dump(class Dump *); // reverse comm from a Dump
|
||||
|
||||
template<class DeviceType> void forward_comm_device(int dummy);
|
||||
template<class DeviceType> void reverse_comm_device();
|
||||
template<class DeviceType> void forward_comm_pair_device(Pair *pair);
|
||||
template<class DeviceType> void exchange_device();
|
||||
template<class DeviceType> void borders_device();
|
||||
|
||||
protected:
|
||||
DAT::tdual_int_2d k_sendlist;
|
||||
DAT::tdual_int_scalar k_total_send;
|
||||
DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
|
||||
DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
|
||||
DAT::tdual_int_1d k_count;
|
||||
DAT::tdual_int_scalar k_count;
|
||||
//double *buf_send; // send buffer for all comm
|
||||
//double *buf_recv; // recv buffer for all comm
|
||||
|
||||
|
||||
@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
|
||||
|
||||
nmax = nmax = m_cap = 0;
|
||||
allocated_flag = 0;
|
||||
nprev = 4;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -158,15 +159,15 @@ void FixQEqReaxKokkos<DeviceType>::init_hist()
|
||||
{
|
||||
int i,j;
|
||||
|
||||
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5);
|
||||
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev);
|
||||
d_s_hist = k_s_hist.template view<DeviceType>();
|
||||
h_s_hist = k_s_hist.h_view;
|
||||
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5);
|
||||
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev);
|
||||
d_t_hist = k_t_hist.template view<DeviceType>();
|
||||
h_t_hist = k_t_hist.h_view;
|
||||
|
||||
for( i = 0; i < atom->nmax; i++ )
|
||||
for( j = 0; j < 5; j++ )
|
||||
for( j = 0; j < nprev; j++ )
|
||||
k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0;
|
||||
|
||||
k_s_hist.template modify<LMPHostType>();
|
||||
@ -334,11 +335,11 @@ void FixQEqReaxKokkos<DeviceType>::allocate_array()
|
||||
d_d = k_d.template view<DeviceType>();
|
||||
h_d = k_d.h_view;
|
||||
|
||||
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5);
|
||||
k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev);
|
||||
d_s_hist = k_s_hist.template view<DeviceType>();
|
||||
h_s_hist = k_s_hist.h_view;
|
||||
|
||||
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5);
|
||||
k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev);
|
||||
d_t_hist = k_t_hist.template view<DeviceType>();
|
||||
h_t_hist = k_t_hist.h_view;
|
||||
}
|
||||
@ -368,7 +369,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
|
||||
d_o[i] = 0.0;
|
||||
d_r[i] = 0.0;
|
||||
d_d[i] = 0.0;
|
||||
//for( int j = 0; j < 5; j++ )
|
||||
//for( int j = 0; j < nprev; j++ )
|
||||
//d_s_hist(i,j) = d_t_hist(i,j) = 0.0;
|
||||
}
|
||||
|
||||
@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos<DeviceType>::calculate_q_item(int ii) const
|
||||
if (mask[i] & groupbit) {
|
||||
q(i) = d_s[i] - delta * d_t[i];
|
||||
|
||||
for (int k = 4; k > 0; --k) {
|
||||
for (int k = nprev-1; k > 0; --k) {
|
||||
d_s_hist(i,k) = d_s_hist(i,k-1);
|
||||
d_t_hist(i,k) = d_t_hist(i,k-1);
|
||||
}
|
||||
@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
|
||||
{
|
||||
double bytes;
|
||||
|
||||
bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist
|
||||
bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist
|
||||
bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage
|
||||
bytes += n_cap*2 * sizeof(int); // matrix...
|
||||
bytes += m_cap * sizeof(int);
|
||||
|
||||
@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
neighflag_qeq_set = 0;
|
||||
exchange_comm_classic = 0;
|
||||
forward_comm_classic = 0;
|
||||
reverse_comm_classic = 0;
|
||||
exchange_comm_on_host = 0;
|
||||
forward_comm_on_host = 0;
|
||||
reverse_comm_on_host = 0;
|
||||
|
||||
#ifdef KILL_KOKKOS_ON_SIGSEGV
|
||||
signal(SIGSEGV, my_signal_handler);
|
||||
@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
neighflag_qeq_set = 0;
|
||||
int newtonflag = 0;
|
||||
double binsize = 0.0;
|
||||
exchange_comm_classic = forward_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = 0;
|
||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||
|
||||
int iarg = 0;
|
||||
while (iarg < narg) {
|
||||
@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
} else if (strcmp(arg[iarg],"comm") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
|
||||
if (strcmp(arg[iarg+1],"no") == 0) {
|
||||
exchange_comm_classic = forward_comm_classic = 1;
|
||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
|
||||
} else if (strcmp(arg[iarg+1],"host") == 0) {
|
||||
exchange_comm_classic = forward_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = 1;
|
||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
|
||||
} else if (strcmp(arg[iarg+1],"device") == 0) {
|
||||
exchange_comm_classic = forward_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = 0;
|
||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||
} else error->all(FLERR,"Illegal package kokkos command");
|
||||
iarg += 2;
|
||||
} else if (strcmp(arg[iarg],"comm/exchange") == 0) {
|
||||
@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
forward_comm_on_host = 0;
|
||||
} else error->all(FLERR,"Illegal package kokkos command");
|
||||
iarg += 2;
|
||||
} else if (strcmp(arg[iarg],"comm/reverse") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
|
||||
if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1;
|
||||
else if (strcmp(arg[iarg+1],"host") == 0) {
|
||||
reverse_comm_classic = 0;
|
||||
reverse_comm_on_host = 1;
|
||||
} else if (strcmp(arg[iarg+1],"device") == 0) {
|
||||
reverse_comm_classic = 0;
|
||||
reverse_comm_on_host = 0;
|
||||
} else error->all(FLERR,"Illegal package kokkos command");
|
||||
iarg += 2;
|
||||
} else error->all(FLERR,"Illegal package kokkos command");
|
||||
}
|
||||
|
||||
|
||||
@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers {
|
||||
int neighflag_qeq_set;
|
||||
int exchange_comm_classic;
|
||||
int forward_comm_classic;
|
||||
int reverse_comm_classic;
|
||||
int exchange_comm_on_host;
|
||||
int forward_comm_on_host;
|
||||
int reverse_comm_on_host;
|
||||
int num_threads,ngpu;
|
||||
int numa;
|
||||
int auto_sync;
|
||||
|
||||
@ -75,6 +75,10 @@ void NBinKokkos<DeviceType>::bin_atoms_setup(int nall)
|
||||
k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins);
|
||||
bincount = k_bincount.view<DeviceType>();
|
||||
}
|
||||
if (nall > k_atom2bin.d_view.dimension_0()) {
|
||||
k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall);
|
||||
atom2bin = k_atom2bin.view<DeviceType>();
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -86,6 +90,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
|
||||
{
|
||||
last_bin = update->ntimestep;
|
||||
|
||||
k_bins.template sync<DeviceType>();
|
||||
k_bincount.template sync<DeviceType>();
|
||||
k_atom2bin.template sync<DeviceType>();
|
||||
|
||||
h_resize() = 1;
|
||||
|
||||
while(h_resize() > 0) {
|
||||
@ -115,6 +123,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
|
||||
c_bins = bins;
|
||||
}
|
||||
}
|
||||
|
||||
k_bins.template modify<DeviceType>();
|
||||
k_bincount.template modify<DeviceType>();
|
||||
k_atom2bin.template modify<DeviceType>();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -125,6 +137,7 @@ void NBinKokkos<DeviceType>::binatomsItem(const int &i) const
|
||||
{
|
||||
const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2));
|
||||
|
||||
atom2bin(i) = ibin;
|
||||
const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1);
|
||||
if(ac < bins.dimension_1()) {
|
||||
bins(ibin, ac) = i;
|
||||
|
||||
@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard {
|
||||
int atoms_per_bin;
|
||||
DAT::tdual_int_1d k_bincount;
|
||||
DAT::tdual_int_2d k_bins;
|
||||
DAT::tdual_int_1d k_atom2bin;
|
||||
|
||||
typename AT::t_int_1d bincount;
|
||||
const typename AT::t_int_1d_const c_bincount;
|
||||
typename AT::t_int_2d bins;
|
||||
typename AT::t_int_2d_const c_bins;
|
||||
typename AT::t_int_1d atom2bin;
|
||||
typename AT::t_int_scalar d_resize;
|
||||
typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
|
||||
typename AT::t_x_array_randomread x;
|
||||
|
||||
@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag)
|
||||
// build pairwise lists for all perpetual NPair/NeighList
|
||||
// grow() with nlocal/nall args so that only realloc if have to
|
||||
|
||||
atomKK->sync(Host,ALL_MASK);
|
||||
for (i = 0; i < npair_perpetual; i++) {
|
||||
m = plist[i];
|
||||
if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
|
||||
if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
|
||||
neigh_pair[m]->build_setup();
|
||||
neigh_pair[m]->build(lists[m]);
|
||||
|
||||
@ -73,6 +73,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
|
||||
atoms_per_bin = nbKK->atoms_per_bin;
|
||||
k_bincount = nbKK->k_bincount;
|
||||
k_bins = nbKK->k_bins;
|
||||
k_atom2bin = nbKK->k_atom2bin;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -88,13 +89,15 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
|
||||
|
||||
int maxstencil = ns->get_maxstencil();
|
||||
|
||||
k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
|
||||
if (maxstencil > k_stencil.dimension_0())
|
||||
k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
|
||||
for (int k = 0; k < maxstencil; k++)
|
||||
k_stencil.h_view(k) = ns->stencil[k];
|
||||
k_stencil.modify<LMPHostType>();
|
||||
k_stencil.sync<DeviceType>();
|
||||
if (GHOST) {
|
||||
k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
|
||||
if (maxstencil > k_stencilxyz.dimension_0())
|
||||
k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
|
||||
for (int k = 0; k < maxstencil; k++) {
|
||||
k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
|
||||
k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
|
||||
@ -122,6 +125,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
|
||||
k_cutneighsq.view<DeviceType>(),
|
||||
k_bincount.view<DeviceType>(),
|
||||
k_bins.view<DeviceType>(),
|
||||
k_atom2bin.view<DeviceType>(),
|
||||
nstencil,
|
||||
k_stencil.view<DeviceType>(),
|
||||
k_stencilxyz.view<DeviceType>(),
|
||||
@ -164,8 +168,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
|
||||
k_ex_mol_group.sync<DeviceType>();
|
||||
k_ex_mol_bit.sync<DeviceType>();
|
||||
k_ex_mol_intra.sync<DeviceType>();
|
||||
k_bincount.sync<DeviceType>(),
|
||||
k_bins.sync<DeviceType>(),
|
||||
k_bincount.sync<DeviceType>();
|
||||
k_bins.sync<DeviceType>();
|
||||
k_atom2bin.sync<DeviceType>();
|
||||
atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
|
||||
|
||||
data.special_flag[0] = special_flag[0];
|
||||
@ -317,7 +322,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT ztmp = x(i, 2);
|
||||
const int itype = type(i);
|
||||
|
||||
const int ibin = coord2bin(xtmp, ytmp, ztmp);
|
||||
const int ibin = c_atom2bin(i);
|
||||
|
||||
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
|
||||
= d_stencil;
|
||||
@ -431,7 +436,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
if(n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
|
||||
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
|
||||
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
|
||||
neigh_list.d_ilist(i) = i;
|
||||
@ -641,7 +646,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
|
||||
if(n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
|
||||
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
|
||||
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -678,7 +683,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
// no molecular test when i = ghost atom
|
||||
|
||||
if (i < nlocal) {
|
||||
const int ibin = coord2bin(xtmp, ytmp, ztmp);
|
||||
const int ibin = c_atom2bin(i);
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
const int jbin = ibin + stencil[k];
|
||||
for(int m = 0; m < c_bincount(jbin); m++) {
|
||||
@ -764,7 +769,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
if(n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
|
||||
if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
|
||||
if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
neigh_list.d_ilist(i) = i;
|
||||
}
|
||||
|
||||
@ -105,6 +105,7 @@ class NPairKokkos : public NPair {
|
||||
int atoms_per_bin;
|
||||
DAT::tdual_int_1d k_bincount;
|
||||
DAT::tdual_int_2d k_bins;
|
||||
DAT::tdual_int_1d k_atom2bin;
|
||||
|
||||
// data from NStencil class
|
||||
|
||||
@ -148,6 +149,8 @@ class NeighborKokkosExecute
|
||||
const typename AT::t_int_1d_const c_bincount;
|
||||
typename AT::t_int_2d bins;
|
||||
typename AT::t_int_2d_const c_bins;
|
||||
const typename AT::t_int_1d atom2bin;
|
||||
const typename AT::t_int_1d_const c_atom2bin;
|
||||
|
||||
|
||||
// data from NStencil class
|
||||
@ -190,6 +193,7 @@ class NeighborKokkosExecute
|
||||
const typename AT::t_xfloat_2d_randomread &_cutneighsq,
|
||||
const typename AT::t_int_1d &_bincount,
|
||||
const typename AT::t_int_2d &_bins,
|
||||
const typename AT::t_int_1d &_atom2bin,
|
||||
const int _nstencil,
|
||||
const typename AT::t_int_1d &_d_stencil,
|
||||
const typename AT::t_int_1d_3 &_d_stencilxyz,
|
||||
@ -224,6 +228,7 @@ class NeighborKokkosExecute
|
||||
const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
|
||||
neigh_list(_neigh_list), cutneighsq(_cutneighsq),
|
||||
bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
|
||||
atom2bin(_atom2bin),c_atom2bin(_atom2bin),
|
||||
nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
|
||||
nlocal(_nlocal),
|
||||
x(_x),type(_type),mask(_mask),molecule(_molecule),
|
||||
@ -281,38 +286,6 @@ class NeighborKokkosExecute
|
||||
void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
|
||||
{
|
||||
int ix,iy,iz;
|
||||
|
||||
if (x >= bboxhi[0])
|
||||
ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
|
||||
else if (x >= bboxlo[0]) {
|
||||
ix = static_cast<int> ((x-bboxlo[0])*bininvx);
|
||||
ix = MIN(ix,nbinx-1);
|
||||
} else
|
||||
ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
|
||||
|
||||
if (y >= bboxhi[1])
|
||||
iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
|
||||
else if (y >= bboxlo[1]) {
|
||||
iy = static_cast<int> ((y-bboxlo[1])*bininvy);
|
||||
iy = MIN(iy,nbiny-1);
|
||||
} else
|
||||
iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
|
||||
|
||||
if (z >= bboxhi[2])
|
||||
iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
|
||||
else if (z >= bboxlo[2]) {
|
||||
iz = static_cast<int> ((z-bboxlo[2])*bininvz);
|
||||
iz = MIN(iz,nbinz-1);
|
||||
} else
|
||||
iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
|
||||
|
||||
return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
|
||||
{
|
||||
|
||||
@ -131,6 +131,8 @@ template<class DeviceType>
|
||||
void PairReaxCKokkos<DeviceType>::init_style()
|
||||
{
|
||||
PairReaxC::init_style();
|
||||
if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version
|
||||
fix_reax = NULL;
|
||||
|
||||
// irequest = neigh request made by parent class
|
||||
|
||||
@ -555,8 +557,8 @@ void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()
|
||||
|
||||
ntypes = atom->ntypes;
|
||||
|
||||
for( i = 0; i < ntypes; ++i ) {
|
||||
for( j = i; j < ntypes; ++j )
|
||||
for( i = 0; i <= ntypes; ++i ) {
|
||||
for( j = i; j <= ntypes; ++j )
|
||||
if( LR[i][j].n ) {
|
||||
sfree( LR[i][j].y, "LR[i,j].y" );
|
||||
sfree( LR[i][j].H, "LR[i,j].H" );
|
||||
|
||||
@ -294,6 +294,7 @@ void VerletKokkos::run(int n)
|
||||
int n_pre_exchange = modify->n_pre_exchange;
|
||||
int n_pre_neighbor = modify->n_pre_neighbor;
|
||||
int n_pre_force = modify->n_pre_force;
|
||||
int n_pre_reverse = modify->n_pre_reverse;
|
||||
int n_post_force = modify->n_post_force;
|
||||
int n_end_of_step = modify->n_end_of_step;
|
||||
|
||||
@ -304,9 +305,9 @@ void VerletKokkos::run(int n)
|
||||
|
||||
f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0());
|
||||
|
||||
static double time = 0.0;
|
||||
atomKK->sync(Device,ALL_MASK);
|
||||
Kokkos::Impl::Timer ktimer;
|
||||
//static double time = 0.0;
|
||||
//Kokkos::Impl::Timer ktimer;
|
||||
|
||||
timer->init_timeout();
|
||||
for (int i = 0; i < n; i++) {
|
||||
@ -320,10 +321,10 @@ void VerletKokkos::run(int n)
|
||||
|
||||
// initial time integration
|
||||
|
||||
ktimer.reset();
|
||||
//ktimer.reset();
|
||||
timer->stamp();
|
||||
modify->initial_integrate(vflag);
|
||||
time += ktimer.seconds();
|
||||
//time += ktimer.seconds();
|
||||
if (n_post_integrate) modify->post_integrate();
|
||||
timer->stamp(Timer::MODIFY);
|
||||
|
||||
@ -523,11 +524,18 @@ void VerletKokkos::run(int n)
|
||||
atomKK->k_f.modify<LMPDeviceType>();
|
||||
}
|
||||
|
||||
if (n_pre_reverse) {
|
||||
modify->pre_reverse(eflag,vflag);
|
||||
timer->stamp(Timer::MODIFY);
|
||||
}
|
||||
|
||||
// reverse communication of forces
|
||||
|
||||
if (force->newton) comm->reverse_comm();
|
||||
timer->stamp(Timer::COMM);
|
||||
if (force->newton) {
|
||||
Kokkos::fence();
|
||||
comm->reverse_comm();
|
||||
timer->stamp(Timer::COMM);
|
||||
}
|
||||
|
||||
// force modifications, final time integration, diagnostics
|
||||
|
||||
|
||||
@ -15,13 +15,14 @@ SHELL = /bin/sh
|
||||
|
||||
CC = CC
|
||||
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
|
||||
$(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = CC
|
||||
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
|
||||
LINKFLAGS = -qopenmp $(OPTFLAGS)
|
||||
LIB =
|
||||
SIZE = size
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ CC = mpiicpc
|
||||
MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
|
||||
CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
|
||||
-xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
|
||||
-qoverride-limits $(MIC_OPT)
|
||||
-qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
|
||||
8
src/MAKE/OPTIONS/Makefile.intel_cpu
Executable file → Normal file
8
src/MAKE/OPTIONS/Makefile.intel_cpu
Executable file → Normal file
@ -8,14 +8,14 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpiicpc
|
||||
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc -ltbbmalloc_proxy
|
||||
LINKFLAGS = -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc
|
||||
SIZE = size
|
||||
|
||||
ARCHIVE = ar
|
||||
|
||||
@ -8,8 +8,8 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
|
||||
@ -8,14 +8,14 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpicxx -cxx=icc
|
||||
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpicxx -cxx=icc
|
||||
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
|
||||
LIB =
|
||||
LINKFLAGS = -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc
|
||||
SIZE = size
|
||||
|
||||
ARCHIVE = ar
|
||||
|
||||
@ -9,14 +9,14 @@ SHELL = /bin/sh
|
||||
export OMPI_CXX = icc
|
||||
CC = mpicxx
|
||||
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpicxx
|
||||
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc -ltbbmalloc_proxy
|
||||
LINKFLAGS = -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc
|
||||
SIZE = size
|
||||
|
||||
ARCHIVE = ar
|
||||
|
||||
@ -1,123 +0,0 @@
|
||||
# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
|
||||
|
||||
SHELL = /bin/sh
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# compiler/linker settings
|
||||
# specify flags and libraries needed for your compiler
|
||||
|
||||
CC = mpiicpc
|
||||
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
|
||||
CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
|
||||
-xHost -fno-alias -ansi-alias -restrict \
|
||||
-qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpiicpc
|
||||
LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
|
||||
LIB = -ltbbmalloc
|
||||
SIZE = size
|
||||
|
||||
ARCHIVE = ar
|
||||
ARFLAGS = -rc
|
||||
SHLIBFLAGS = -shared
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# LAMMPS-specific settings, all OPTIONAL
|
||||
# specify settings for LAMMPS features you will use
|
||||
# if you change any -D setting, do full re-compile after "make clean"
|
||||
|
||||
# LAMMPS ifdef settings
|
||||
# see possible settings in Section 2.2 (step 4) of manual
|
||||
|
||||
LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG
|
||||
|
||||
# MPI library
|
||||
# see discussion in Section 2.2 (step 5) of manual
|
||||
# MPI wrapper compiler/linker can provide this info
|
||||
# can point to dummy MPI library in src/STUBS as in Makefile.serial
|
||||
# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
|
||||
# INC = path for mpi.h, MPI compiler settings
|
||||
# PATH = path for MPI library
|
||||
# LIB = name of MPI library
|
||||
|
||||
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
|
||||
MPI_PATH =
|
||||
MPI_LIB =
|
||||
|
||||
# FFT library
|
||||
# see discussion in Section 2.2 (step 6) of manual
|
||||
# can be left blank to use provided KISS FFT library
|
||||
# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
|
||||
# PATH = path for FFT library
|
||||
# LIB = name of FFT library
|
||||
|
||||
FFT_INC = -DFFT_MKL -DFFT_SINGLE
|
||||
FFT_PATH =
|
||||
FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
|
||||
|
||||
# JPEG and/or PNG library
|
||||
# see discussion in Section 2.2 (step 7) of manual
|
||||
# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
|
||||
# INC = path(s) for jpeglib.h and/or png.h
|
||||
# PATH = path(s) for JPEG library and/or PNG library
|
||||
# LIB = name(s) of JPEG library and/or PNG library
|
||||
|
||||
JPG_INC =
|
||||
JPG_PATH =
|
||||
JPG_LIB = -ljpeg
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# build rules and dependencies
|
||||
# do not edit this section
|
||||
|
||||
include Makefile.package.settings
|
||||
include Makefile.package
|
||||
|
||||
EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
|
||||
EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
|
||||
EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
|
||||
EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
|
||||
EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
|
||||
|
||||
# Path to src files
|
||||
|
||||
vpath %.cpp ..
|
||||
vpath %.h ..
|
||||
|
||||
# Link target
|
||||
|
||||
$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS)
|
||||
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
|
||||
$(SIZE) $(EXE)
|
||||
|
||||
# Library targets
|
||||
|
||||
lib: $(OBJ) $(EXTRA_LINK_DEPENDS)
|
||||
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
|
||||
|
||||
shlib: $(OBJ) $(EXTRA_LINK_DEPENDS)
|
||||
$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
|
||||
$(OBJ) $(EXTRA_LIB) $(LIB)
|
||||
|
||||
# Compilation rules
|
||||
|
||||
%.o:%.cpp $(EXTRA_CPP_DEPENDS)
|
||||
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
|
||||
|
||||
%.d:%.cpp $(EXTRA_CPP_DEPENDS)
|
||||
$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
|
||||
|
||||
%.o:%.cu $(EXTRA_CPP_DEPENDS)
|
||||
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
|
||||
|
||||
# Individual dependencies
|
||||
|
||||
depend : fastdep.exe $(SRC)
|
||||
@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
|
||||
|
||||
fastdep.exe: ../DEPEND/fastdep.c
|
||||
cc -O -o $@ $<
|
||||
|
||||
sinclude .depend
|
||||
@ -8,13 +8,13 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
|
||||
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
|
||||
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpiicpc
|
||||
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
|
||||
LINKFLAGS = -qopenmp $(OPTFLAGS)
|
||||
LIB = -ltbbmalloc
|
||||
SIZE = size
|
||||
|
||||
|
||||
@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg)
|
||||
time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
|
||||
bigint clock = 0;
|
||||
|
||||
timer->init();
|
||||
timer->barrier_start();
|
||||
time_start = timer->get_wall(Timer::TOTAL);
|
||||
|
||||
|
||||
@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg)
|
||||
nbuild = ndanger = 0;
|
||||
time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;
|
||||
|
||||
timer->init();
|
||||
timer->barrier_start();
|
||||
time_start = timer->get_wall(Timer::TOTAL);
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ action nbin_intel.h
|
||||
action nbin_intel.cpp
|
||||
action npair_intel.h
|
||||
action npair_intel.cpp
|
||||
action intel_simd.h pair_sw_intel.cpp
|
||||
action intel_simd.h
|
||||
action intel_intrinsics.h pair_tersoff_intel.cpp
|
||||
action intel_intrinsics_airebo.h pair_airebo_intel.cpp
|
||||
|
||||
|
||||
@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version:
|
||||
|
||||
2017 update 2 - No changes needed
|
||||
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
|
||||
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
|
||||
2018 initial release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
|
||||
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
When using the suffix command with "intel", intel styles will be used if they
|
||||
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
|
||||
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
|
||||
allow for running most styles in LAMMPS with threading.
|
||||
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
|
||||
available. This allows running most styles in LAMMPS with threading.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
The Long-Range Thread mode (LRT) in the Intel package currently uses
|
||||
pthreads by default. If pthreads are not supported in the build environment,
|
||||
the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for
|
||||
builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
|
||||
build with compilers that support threads using the C++11 standard. When using
|
||||
The Long-Range Thread mode (LRT) in the Intel package is enabled through the
|
||||
-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles
|
||||
include this define. This feature will use pthreads by default.
|
||||
Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
|
||||
support threads intrinsically using the C++11 standard. When using
|
||||
LRT mode, you might need to disable OpenMP affinity settings (e.g.
|
||||
export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
|
||||
need to be changed.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
|
||||
should be added to the compile flags. This will enable using the MKL Mersenne
|
||||
Twister random number generator (RNG) for Dissipative Particle Dynamics
|
||||
(DPD). This RNG can allow significantly faster performance and it also has a
|
||||
significantly longer period than the standard RNG for DPD.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
|
||||
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
|
||||
Intel compilers.
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
# in.intel.tersoff - Silicon benchmark with Tersoff
|
||||
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
|
||||
# in.intel.airebo - Polyethylene benchmark with AIREBO
|
||||
# in.intel.dpd - Dissipative Particle Dynamics
|
||||
#
|
||||
#############################################################################
|
||||
|
||||
@ -16,16 +17,17 @@
|
||||
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
|
||||
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
|
||||
#
|
||||
# Xeon E5-2697v4 Xeon Phi 7250
|
||||
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
|
||||
#
|
||||
# in.intel.lj - 199.5 282.3
|
||||
# in.intel.rhodo - 12.4 17.5
|
||||
# in.intel.lc - 19.0 25.7
|
||||
# in.intel.eam - 59.4 92.8
|
||||
# in.intel.sw - 132.4 161.9
|
||||
# in.intel.tersoff - 83.3 101.1
|
||||
# in.intel.water - 53.4 90.3
|
||||
# in.intel.airebo - 7.3 11.8
|
||||
# in.intel.lj - 199.5 282.3 317.3
|
||||
# in.intel.rhodo - 12.4 17.5 24.4
|
||||
# in.intel.lc - 19.0 25.7 26.8
|
||||
# in.intel.eam - 59.4 92.8 105.6
|
||||
# in.intel.sw - 132.4 161.9 213.8
|
||||
# in.intel.tersoff - 83.3 101.1 109.6
|
||||
# in.intel.water - 53.4 90.3 105.5
|
||||
# in.intel.airebo - 7.3 11.8 17.6
|
||||
# in.intel.dpd - 74.5 100.4 148.1
|
||||
#
|
||||
#############################################################################
|
||||
|
||||
|
||||
48
src/USER-INTEL/TEST/in.intel.dpd
Normal file
48
src/USER-INTEL/TEST/in.intel.dpd
Normal file
@ -0,0 +1,48 @@
|
||||
# DPD benchmark
# Builds an fcc lattice of a single atom type in a periodic-style block,
# runs an optional warmup, then a timed main run with pair_style dpd.
|
||||
|
||||
# Run-control switches (index-style variables; presumably overridable from
# the command line with -v/-var -- confirm against the LAMMPS manual)
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 4000 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
variable n index 0 # Use NUMA Mapping for Multi-Node
|
||||
variable p index 0 # Use Power Measurement
|
||||
|
||||
# Problem-size multipliers for the three box dimensions
variable x index 4
|
||||
variable y index 2
|
||||
variable z index 2
|
||||
|
||||
# Box extent in lattice units: 20 cells per unit of x, y, z
variable xx equal 20*$x
|
||||
variable yy equal 20*$y
|
||||
variable zz equal 20*$z
|
||||
# Length of the main run: t scaled by m, rounded down
variable rr equal floor($t*$m)
|
||||
|
||||
newton $N
|
||||
# Only request the NUMA-aware processor grid when n > 0
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units lj
|
||||
atom_style atomic
|
||||
# vel yes: velocities are communicated along with ghost atoms
comm_modify mode single vel yes
|
||||
|
||||
# Geometry: fcc lattice filling the block 0..xx, 0..yy, 0..zz
lattice fcc 3.0
|
||||
region box block 0 ${xx} 0 ${yy} 0 ${zz}
|
||||
create_box 1 box
|
||||
create_atoms 1 box
|
||||
mass 1 1.0
|
||||
|
||||
# Initial velocities (fixed seed 87287)
velocity all create 1.0 87287 loop geom
|
||||
|
||||
# DPD interaction for the single type pair 1-1 (fixed seed 928948)
pair_style dpd 1.0 1.0 928948
|
||||
pair_coeff 1 1 25.0 4.5
|
||||
|
||||
neighbor 0.5 bin
|
||||
neigh_modify delay 0 every 1
|
||||
|
||||
fix 1 all nve
|
||||
timestep 0.04
|
||||
|
||||
thermo 1000
|
||||
|
||||
# Optional power-measurement integrator, selected when p > 0
if "$p > 0" then "run_style verlet/power"
|
||||
|
||||
# Warmup run (skipped when w is 0), followed by the main run of rr steps
if "$w > 0" then "run $w"
|
||||
run ${rr}
|
||||
441
src/USER-INTEL/dihedral_fourier_intel.cpp
Normal file
441
src/USER-INTEL/dihedral_fourier_intel.cpp
Normal file
@ -0,0 +1,441 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <mpi.h>
|
||||
#include <math.h>
|
||||
#include "dihedral_fourier_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "force.h"
|
||||
#include "pair.h"
|
||||
#include "update.h"
|
||||
#include "error.h"
|
||||
|
||||
#include "suffix.h"
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define PTOLERANCE (flt_t)1.05
|
||||
#define MTOLERANCE (flt_t)-1.05
|
||||
typedef struct { int a,b,c,d,t; } int5_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
   Construct the /intel variant: build the base DihedralFourier style,
   then add the INTEL bit to suffix_flag.
------------------------------------------------------------------------- */

DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp) :
  DihedralFourier(lmp)
{
  suffix_flag = suffix_flag | Suffix::INTEL;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralFourierIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
DihedralFourier::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) {
|
||||
ev_setup(eflag,vflag);
|
||||
} else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   Threaded force kernel for the Fourier dihedral style.
   Loops over the entries of neighbor->dihedrallist, computes the force on
   the four atoms of each dihedral and adds it to the calling thread's
   slice of the force buffer.
   EFLAG / VFLAG   compile-time switches for energy / virial tallying
   NEWTON_BOND     when 0, forces are only stored for atoms with
                   index < nlocal
   vflag           run-time virial switch, combined with VFLAG
   On return, energy and virial[0..5] have been incremented by the
   accumulated totals and fix->set_reduce_flag() has been called.
------------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
|
||||
{
|
||||
const int inum = neighbor->ndihedrallist;
|
||||
// no dihedrals in the list: nothing to compute
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
// length of one thread's force slice: sized for local+ghost atoms with
// NEWTON_BOND, for local atoms only otherwise
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
// IP_PRE_get_buffers is a macro defined elsewhere; presumably it fills
// tc (thread count), f_start and ev_global -- confirm in intel_preprocess.h
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
// totals across all threads (OpenMP reduction targets)
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EFLAG) oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
// per-thread iteration bounds and thread id, set by the macros below
// (contiguous range in the SIMDOFF build, start/step/end otherwise)
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
// this thread's private slice of the force buffer
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
// clear the slice first when the fix reports it needs zeroing
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
// view the dihedral list as packed {a,b,c,d,t} records
// NOTE(review): relies on dihedrallist rows being 5 contiguous ints -- confirm
const int5_t * _noalias const dihedrallist =
|
||||
(int5_t *) neighbor->dihedrallist[0];
|
||||
|
||||
// SIMDOFF build: thread-local accumulators used as simd reduction targets
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) sedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
// atom indices and dihedral type of entry n
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
const int i4 = dihedrallist[n].d;
|
||||
const int type = dihedrallist[n].t;
|
||||
|
||||
// 1st bond
|
||||
|
||||
const flt_t vb1x = x[i1].x - x[i2].x;
|
||||
const flt_t vb1y = x[i1].y - x[i2].y;
|
||||
const flt_t vb1z = x[i1].z - x[i2].z;
|
||||
|
||||
// 2nd bond
|
||||
|
||||
const flt_t vb2xm = x[i2].x - x[i3].x;
|
||||
const flt_t vb2ym = x[i2].y - x[i3].y;
|
||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||
|
||||
// 3rd bond
|
||||
|
||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||
|
||||
// c,s calculation
|
||||
|
||||
// a = vb1 x vb2m and b = vb3 x vb2m (cross products)
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
|
||||
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
|
||||
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
|
||||
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
|
||||
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
|
||||
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
|
||||
|
||||
const flt_t rasq = ax*ax + ay*ay + az*az;
|
||||
const flt_t rbsq = bx*bx + by*by + bz*bz;
|
||||
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
|
||||
const flt_t rg = sqrt(rgsq);
|
||||
|
||||
// inverses stay 0 for degenerate geometry, avoiding division by zero
flt_t rginv, ra2inv, rb2inv;
|
||||
rginv = ra2inv = rb2inv = (flt_t)0.0;
|
||||
if (rg > 0) rginv = (flt_t)1.0/rg;
|
||||
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
|
||||
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
|
||||
const flt_t rabinv = sqrt(ra2inv*rb2inv);
|
||||
|
||||
// c = (a.b)/(|a||b|); s = |vb2m| (a.vb3)/(|a||b|)
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
|
||||
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
|
||||
|
||||
// error check
|
||||
// (compiled out in the SIMDOFF build) warn when c lies outside
// [MTOLERANCE, PTOLERANCE] and print the four atom positions
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me = comm->me;
|
||||
|
||||
if (screen) {
|
||||
char str[128];
|
||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||
me,tid,update->ntimestep,
|
||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||
error->warning(FLERR,str,0);
|
||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||
me,x[i1].x,x[i1].y,x[i1].z);
|
||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||
me,x[i2].x,x[i2].y,x[i2].z);
|
||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||
me,x[i3].x,x[i3].y,x[i3].z);
|
||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// clamp c to [-1,1]
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
|
||||
// deng accumulates the energy, df the force prefactor, over all terms
flt_t deng;
|
||||
flt_t df = (flt_t)0.0;
|
||||
if (EFLAG) deng = (flt_t)0.0;
|
||||
|
||||
// sum over the Fourier terms of this dihedral type
for (int j = 0; j < nterms[type]; j++) {
|
||||
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
|
||||
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
|
||||
const flt_t tk = fc.bp[j][type].k;
|
||||
const int m = fc.bp[j][type].multiplicity;
|
||||
|
||||
flt_t p = (flt_t)1.0;
|
||||
flt_t ddf1, df1;
|
||||
ddf1 = df1 = (flt_t)0.0;
|
||||
|
||||
// m applications of the rotation (p,df1) -> (p*c - df1*s, p*s + df1*c),
// i.e. (p + i*df1) is multiplied by (c + i*s) m times
for (int i = 0; i < m; i++) {
|
||||
ddf1 = p*c - df1*s;
|
||||
df1 = p*s + df1*c;
|
||||
p = ddf1;
|
||||
}
|
||||
|
||||
// apply the per-term shift (presumably cos/sin of the phase offset --
// confirm against DihedralFourier::coeff), then scale df1 by -m
p = p*tcos_shift + df1*tsin_shift;
|
||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||
df1 *= -m;
|
||||
p += (flt_t)1.0;
|
||||
|
||||
// multiplicity 0: constant term, no force contribution
if (m == 0) {
|
||||
p = (flt_t)1.0 + tcos_shift;
|
||||
df1 = (flt_t)0.0;
|
||||
}
|
||||
|
||||
if (EFLAG) deng += tk * p;
|
||||
df -= tk * df1;
|
||||
}
|
||||
|
||||
// geometric factors used to distribute df onto the four atoms
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
||||
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
|
||||
const flt_t fga = fg*ra2inv*rginv;
|
||||
const flt_t hgb = hg*rb2inv*rginv;
|
||||
const flt_t gaa = -ra2inv*rg;
|
||||
const flt_t gbb = rb2inv*rg;
|
||||
|
||||
const flt_t dtfx = gaa*ax;
|
||||
const flt_t dtfy = gaa*ay;
|
||||
const flt_t dtfz = gaa*az;
|
||||
const flt_t dtgx = fga*ax - hgb*bx;
|
||||
const flt_t dtgy = fga*ay - hgb*by;
|
||||
const flt_t dtgz = fga*az - hgb*bz;
|
||||
const flt_t dthx = gbb*bx;
|
||||
const flt_t dthy = gbb*by;
|
||||
const flt_t dthz = gbb*bz;
|
||||
|
||||
const flt_t sx2 = df*dtgx;
|
||||
const flt_t sy2 = df*dtgy;
|
||||
const flt_t sz2 = df*dtgz;
|
||||
|
||||
// force on atom 1
flt_t f1x = df*dtfx;
|
||||
flt_t f1y = df*dtfy;
|
||||
flt_t f1z = df*dtfz;
|
||||
|
||||
// force on atom 2
const flt_t f2x = sx2 - f1x;
|
||||
const flt_t f2y = sy2 - f1y;
|
||||
const flt_t f2z = sz2 - f1z;
|
||||
|
||||
// force on atom 4
flt_t f4x = df*dthx;
|
||||
flt_t f4y = df*dthy;
|
||||
flt_t f4z = df*dthz;
|
||||
|
||||
// force on atom 3
const flt_t f3x = -sx2 - f4x;
|
||||
const flt_t f3y = -sy2 - f4y;
|
||||
const flt_t f3z = -sz2 - f4z;
|
||||
|
||||
// energy/virial tally via macro; the SIMDOFF build accumulates into the
// s* locals, the default build directly into the o* reduction variables
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
|
||||
// scatter the forces into this thread's slice; without NEWTON_BOND only
// atoms with index < nlocal are updated
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
// SIMDOFF build: fold the thread-local sums into the reduction variables
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
// add the reduced totals to the style's energy and virial accumulators
if (EFLAG) energy += oedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralFourierIntel::init_style()
|
||||
{
|
||||
DihedralFourier::init_style();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
const int bp1 = atom->ndihedraltypes + 1;
|
||||
fc.set_ntypes(bp1, setflag, nterms, memory);
|
||||
|
||||
for (int i = 1; i < bp1; i++) {
|
||||
if (setflag[i]) {
|
||||
for (int j = 0; j < nterms[i]; j++) {
|
||||
fc.bp[j][i].cos_shift = cos_shift[i][j];
|
||||
fc.bp[j][i].sin_shift = sin_shift[i][j];
|
||||
fc.bp[j][i].k = k[i][j];
|
||||
fc.bp[j][i].multiplicity = multiplicity[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||
int *setflag,
|
||||
int *nterms,
|
||||
Memory *memory) {
|
||||
if (nbondtypes != _nbondtypes) {
|
||||
if (_nbondtypes > 0)
|
||||
_memory->destroy(bp);
|
||||
|
||||
if (nbondtypes > 0) {
|
||||
_maxnterms = 1;
|
||||
for (int i = 1; i <= nbondtypes; i++)
|
||||
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
|
||||
|
||||
_memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
|
||||
}
|
||||
}
|
||||
_nbondtypes = nbondtypes;
|
||||
_memory = memory;
|
||||
}
|
||||
82
src/USER-INTEL/dihedral_fourier_intel.h
Normal file
82
src/USER-INTEL/dihedral_fourier_intel.h
Normal file
@ -0,0 +1,82 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef DIHEDRAL_CLASS
|
||||
|
||||
DihedralStyle(fourier/intel,DihedralFourierIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
|
||||
#define LMP_DIHEDRAL_FOURIER_INTEL_H
|
||||
|
||||
#include "dihedral_fourier.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class DihedralFourierIntel : public DihedralFourier {
|
||||
|
||||
public:
|
||||
DihedralFourierIntel(class LAMMPS *lmp);
|
||||
virtual void compute(int, int);
|
||||
void init_style();
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t cos_shift, sin_shift, k;
|
||||
int multiplicity; } fc_packed1;
|
||||
|
||||
fc_packed1 **bp;
|
||||
|
||||
ForceConst() : _nbondtypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
|
||||
|
||||
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
|
||||
Memory *memory);
|
||||
|
||||
private:
|
||||
int _nbondtypes, _maxnterms;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@ -285,6 +285,7 @@ int FixIntel::setmask()
|
||||
{
|
||||
int mask = 0;
|
||||
mask |= PRE_REVERSE;
|
||||
mask |= MIN_PRE_REVERSE;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
mask |= POST_FORCE;
|
||||
mask |= MIN_POST_FORCE;
|
||||
|
||||
@ -43,6 +43,7 @@ class FixIntel : public Fix {
|
||||
virtual int setmask();
|
||||
virtual void init();
|
||||
virtual void setup(int);
|
||||
inline void min_setup(int in) { setup(in); }
|
||||
void setup_pre_reverse(int eflag = 0, int vflag = 0);
|
||||
|
||||
void pair_init_check(const bool cdmessage=false);
|
||||
@ -50,6 +51,8 @@ class FixIntel : public Fix {
|
||||
void kspace_init_check();
|
||||
|
||||
void pre_reverse(int eflag = 0, int vflag = 0);
|
||||
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
|
||||
{ pre_reverse(eflag, vflag); }
|
||||
|
||||
// Get all forces, calculation results from coprocessor
|
||||
void sync_coprocessor();
|
||||
|
||||
@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
|
||||
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
|
||||
#endif
|
||||
memset(_ccachei, 0, vsize * sizeof(int));
|
||||
memset(_ccachej, 0, vsize * sizeof(int));
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||
}
|
||||
#ifdef LMP_USE_AVXCD
|
||||
|
||||
@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
ito = inum; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
|
||||
nthr, vecsize) \
|
||||
{ \
|
||||
tid = 0; \
|
||||
ifrom = 0; \
|
||||
ip = 1; \
|
||||
ito = inum; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||
|
||||
@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
const int bstart = binhead[ibin + stencil[k]];
|
||||
const int bend = binhead[ibin + stencil[k] + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
|
||||
@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
const int bstart = binhead[ibin];
|
||||
const int bend = binhead[ibin + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
|
||||
617
src/USER-INTEL/pair_dpd_intel.cpp
Normal file
617
src/USER-INTEL/pair_dpd_intel.cpp
Normal file
@ -0,0 +1,617 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
Shun Xu (Computer Network Information Center, CAS)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include "pair_dpd_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "memory.h"
|
||||
#include "modify.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neigh_request.h"
|
||||
#include "suffix.h"
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define LMP_MKL_RNG VSL_BRNG_MT19937
|
||||
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
|
||||
#define IEPSILON 1.0e10
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Construct the Intel-package variant of the DPD pair style.
// Marks the style with the INTEL suffix flag, disables rRESPA support and
// puts every member declared by this class into a defined state.
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
  PairDPD(lmp)
{
  suffix_flag |= Suffix::INTEL;
  respa_enable = 0;

  // no per-thread random number generators exist until settings() or
  // read_restart_settings() has been called
  random_thread = NULL;
  _nrandom_thread = 0;

  // fix is looked up in init_style(); _onetype is computed in
  // pack_force_const().  _cop is only ever forwarded to
  // ForceConst::set_ntypes() in this file, so give it a defined
  // placeholder value instead of passing an indeterminate int around.
  fix = NULL;
  _cop = -1;
  _onetype = 0;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Release the per-thread random number generators and the array that
// holds them.  Generators are only created inside OpenMP regions (see
// settings()), hence the matching _OPENMP guard around their deletion.
PairDPDIntel::~PairDPDIntel()
{
  #if defined(_OPENMP)
  #ifdef LMP_USE_MKL_RNG
  // every slot holds an MKL stream created with vslNewStream()
  for (int t = 0; t < _nrandom_thread; ++t)
    vslDeleteStream(&random_thread[t]);
  #else
  // slot 0 is set to the base-class generator `random` in settings(),
  // so only slots 1 and up are deleted here
  for (int t = 1; t < _nrandom_thread; ++t)
    delete random_thread[t];
  #endif
  #endif

  delete [] random_thread;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
|
||||
fix->balance_stamp();
|
||||
vflag_fdotr = 0;
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void PairDPDIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) {
|
||||
ev_setup(eflag, vflag);
|
||||
} else evflag = vflag_fdotr = 0;
|
||||
|
||||
const int inum = list->inum;
|
||||
const int nthreads = comm->nthreads;
|
||||
const int host_start = fix->host_start_pair();
|
||||
const int offload_end = fix->offload_end_pair();
|
||||
const int ago = neighbor->ago;
|
||||
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (_onetype) {
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairDPDIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
{
|
||||
const int inum = aend - astart;
|
||||
if (inum == 0) return;
|
||||
int nlocal, nall, minlocal;
|
||||
fix->get_buffern(offload, nlocal, nall, minlocal);
|
||||
|
||||
const int ago = neighbor->ago;
|
||||
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(offload);
|
||||
typedef struct { double x, y, z; } lmp_vt;
|
||||
lmp_vt *v = (lmp_vt *)atom->v[0];
|
||||
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
|
||||
|
||||
const int * _noalias const numneigh = list->numneigh;
|
||||
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const FC_PACKED1_T * _noalias const param = fc.param[0];
|
||||
const flt_t * _noalias const special_lj = fc.special_lj;
|
||||
int * _noalias const rngi_thread = fc.rngi;
|
||||
const int rng_size = buffers->get_max_nbors();
|
||||
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int eatom = this->eflag_atom;
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
int *overflow = fix->get_off_overflow_flag();
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
VSLStreamStatePtr *my_random = &(random_thread[tid]);
|
||||
#else
|
||||
RanMars *my_random = random_thread[tid];
|
||||
#endif
|
||||
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
|
||||
int rngi = rngi_thread[tid];
|
||||
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
flt_t icut, a0, gamma, sigma;
|
||||
if (ONETYPE) {
|
||||
icut = param[3].icut;
|
||||
a0 = param[3].a0;
|
||||
gamma = param[3].gamma;
|
||||
sigma = param[3].sigma;
|
||||
}
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
int itype, ptr_off;
|
||||
const FC_PACKED1_T * _noalias parami;
|
||||
if (!ONETYPE) {
|
||||
itype = x[i].w;
|
||||
ptr_off = itype * ntypes;
|
||||
parami = param + ptr_off;
|
||||
}
|
||||
|
||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
|
||||
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t vxtmp = v[i].x;
|
||||
const flt_t vytmp = v[i].y;
|
||||
const flt_t vztmp = v[i].z;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
if (rngi + jnum > rng_size) {
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
if (sizeof(flt_t) == sizeof(float))
|
||||
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
|
||||
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
|
||||
else
|
||||
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
|
||||
(double*)my_rand_buffer, 0.0, 1.0 );
|
||||
#else
|
||||
for (int jj = 0; jj < rngi; jj++)
|
||||
my_rand_buffer[jj] = my_random->gaussian();
|
||||
#endif
|
||||
rngi = 0;
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
flt_t forcelj, evdwl;
|
||||
forcelj = evdwl = (flt_t)0.0;
|
||||
|
||||
int j, jtype, sbindex;
|
||||
if (!ONETYPE) {
|
||||
sbindex = jlist[jj] >> SBBITS & 3;
|
||||
j = jlist[jj] & NEIGHMASK;
|
||||
} else
|
||||
j = jlist[jj];
|
||||
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
if (!ONETYPE) {
|
||||
jtype = x[j].w;
|
||||
icut = parami[jtype].icut;
|
||||
}
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
|
||||
|
||||
if (rinv > icut) {
|
||||
flt_t factor_dpd;
|
||||
if (!ONETYPE) factor_dpd = special_lj[sbindex];
|
||||
|
||||
flt_t delvx = vxtmp - v[j].x;
|
||||
flt_t delvy = vytmp - v[j].y;
|
||||
flt_t delvz = vztmp - v[j].z;
|
||||
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
|
||||
flt_t randnum = my_rand_buffer[jj];
|
||||
|
||||
flt_t iwd = rinv - icut;
|
||||
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
|
||||
|
||||
if (!ONETYPE) {
|
||||
a0 = parami[jtype].a0;
|
||||
gamma = parami[jtype].gamma;
|
||||
sigma = parami[jtype].sigma;
|
||||
}
|
||||
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
|
||||
if (!ONETYPE) fpair *= factor_dpd;
|
||||
fpair *= iwd;
|
||||
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EFLAG) {
|
||||
flt_t cut = (flt_t)1.0/icut;
|
||||
flt_t r = (flt_t)1.0/rinv;
|
||||
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
|
||||
if (!ONETYPE) evdwl *= factor_dpd;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||
} // if rsq
|
||||
} // for jj
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
rngi += jnum;
|
||||
} // for ii
|
||||
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
rngi_thread[tid] = rngi;
|
||||
} // end omp
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
if (offload)
|
||||
fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Process the pair_style arguments through PairDPD::settings() and
// (re)build one random number generator per thread.
//   narg, arg : pair_style arguments, forwarded unchanged to the base class
void PairDPDIntel::settings(int narg, char **arg) {
  // Drop the generators created by an earlier call.  They are only ever
  // created inside OpenMP regions, so only delete them in OpenMP builds.
  #if defined(_OPENMP)
  if (_nrandom_thread) {
    #ifdef LMP_USE_MKL_RNG
    for (int i = 0; i < _nrandom_thread; i++)
      vslDeleteStream(&random_thread[i]);
    #else
    // slot 0 aliases the base-class generator `random`; not owned here
    for (int i = 1; i < _nrandom_thread; i++)
      delete random_thread[i];
    #endif
  }
  #endif
  // The array itself is allocated in every build configuration, so it has
  // to be released in every configuration too.
  delete [] random_thread;
  // Leave no dangling pointer/count behind: if the base-class call below
  // does not return normally (e.g. an error raised as an exception), the
  // destructor must not free the same memory again.
  random_thread = NULL;
  _nrandom_thread = 0;

  PairDPD::settings(narg,arg);

  #ifdef LMP_USE_MKL_RNG

  // value-initialized so that slots not filled below hold NULL
  random_thread = new VSLStreamStatePtr[comm->nthreads]();
  // NOTE(review): without OpenMP no stream is created at all here, yet
  // eval() uses random_thread[tid] - confirm this combination is excluded.
  #if defined(_OPENMP)
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    vslNewStream(&random_thread[tid], LMP_MKL_RNG,
                 seed + comm->me + comm->nprocs * tid );
  }
  #endif

  #else

  // value-initialized so that slots not filled below hold NULL
  random_thread = new RanMars*[comm->nthreads]();
  // thread 0 shares the generator owned by the base class
  random_thread[0] = random;
  #if defined(_OPENMP)
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    if (tid > 0)
      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
  }
  #endif

  #endif

  _nrandom_thread = comm->nthreads;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::init_style()
|
||||
{
|
||||
PairDPD::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
fix->pair_init_check();
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (fix->offload_balance() != 0.0)
|
||||
error->all(FLERR,
|
||||
"Offload for dpd/intel is not yet available. Set balance to 0.");
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
_onetype = 0;
|
||||
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
|
||||
buffers->set_ntypes(tp1);
|
||||
flt_t **cutneighsq = buffers->get_cutneighsq();
|
||||
|
||||
// Repeat cutsq calculation because done after call to init_style
|
||||
double cut, cutneigh;
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
for (int j = i; j <= atom->ntypes; j++) {
|
||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||
cut = init_one(i,j);
|
||||
cutneigh = cut + neighbor->skin;
|
||||
cutsq[i][j] = cutsq[j][i] = cut*cut;
|
||||
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
|
||||
double icut = 1.0 / cut;
|
||||
fc.param[i][j].icut = fc.param[j][i].icut = icut;
|
||||
} else {
|
||||
cut = init_one(i,j);
|
||||
double icut = 1.0 / cut;
|
||||
fc.param[i][j].icut = fc.param[j][i].icut = icut;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fc.special_lj[i] = force->special_lj[i];
|
||||
fc.special_lj[0] = 1.0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
fc.param[i][j].a0 = a0[i][j];
|
||||
fc.param[i][j].gamma = gamma[i][j];
|
||||
fc.param[i][j].sigma = sigma[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||
const int nthreads,
|
||||
const int max_nbors,
|
||||
Memory *memory,
|
||||
const int cop) {
|
||||
if (ntypes != _ntypes) {
|
||||
if (_ntypes > 0) {
|
||||
_memory->destroy(param);
|
||||
_memory->destroy(rand_buffer_thread);
|
||||
_memory->destroy(rngi);
|
||||
}
|
||||
if (ntypes > 0) {
|
||||
_cop = cop;
|
||||
memory->create(param,ntypes,ntypes,"fc.param");
|
||||
memory->create(rand_buffer_thread, nthreads, max_nbors,
|
||||
"fc.rand_buffer_thread");
|
||||
memory->create(rngi,nthreads,"fc.param");
|
||||
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
|
||||
}
|
||||
}
|
||||
_ntypes = ntypes;
|
||||
_memory = memory;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
proc 0 reads from restart file, bcasts
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::read_restart_settings(FILE *fp)
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
if (_nrandom_thread) {
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
for (int i = 0; i < _nrandom_thread; i++)
|
||||
vslDeleteStream(&random_thread[i]);
|
||||
#else
|
||||
for (int i = 1; i < _nrandom_thread; i++)
|
||||
delete random_thread[i];
|
||||
#endif
|
||||
}
|
||||
delete []random_thread;
|
||||
#endif
|
||||
PairDPD::read_restart_settings(fp);
|
||||
_nrandom_thread = comm->nthreads;
|
||||
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
|
||||
random_thread=new VSLStreamStatePtr[comm->nthreads];
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
|
||||
seed + comm->me + comm->nprocs * tid );
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
random_thread =new RanMars*[comm->nthreads];
|
||||
random_thread[0] = random;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if (tid > 0)
|
||||
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
110
src/USER-INTEL/pair_dpd_intel.h
Normal file
110
src/USER-INTEL/pair_dpd_intel.h
Normal file
@ -0,0 +1,110 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
Shun Xu (Computer Network Information Center, CAS)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
|
||||
PairStyle(dpd/intel,PairDPDIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_DPD_INTEL_H
|
||||
#define LMP_PAIR_DPD_INTEL_H
|
||||
|
||||
#include "pair_dpd.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
#include "mkl_vsl.h"
|
||||
#else
|
||||
#include "random_mars.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairDPDIntel : public PairDPD {
|
||||
|
||||
public:
|
||||
PairDPDIntel(class LAMMPS *);
|
||||
~PairDPDIntel();
|
||||
|
||||
virtual void compute(int, int);
|
||||
void settings(int, char **);
|
||||
void init_style();
|
||||
void read_restart_settings(FILE *);
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
int _cop, _onetype, _nrandom_thread;
|
||||
|
||||
#ifdef LMP_USE_MKL_RNG
|
||||
VSLStreamStatePtr *random_thread;
|
||||
#else
|
||||
RanMars **random_thread;
|
||||
#endif
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
|
||||
|
||||
_alignvar(flt_t special_lj[4],64);
|
||||
fc_packed1 **param;
|
||||
flt_t **rand_buffer_thread;
|
||||
int *rngi;
|
||||
|
||||
ForceConst() : _ntypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
|
||||
|
||||
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
|
||||
Memory *memory, const int cop);
|
||||
|
||||
private:
|
||||
int _ntypes, _cop;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: The 'package intel' command is required for /intel styles
|
||||
|
||||
Self-explanatory.
|
||||
|
||||
*/
|
||||
@ -68,7 +68,7 @@ void VerletLRTIntel::init()
|
||||
|
||||
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
|
||||
|
||||
#ifdef LMP_INTEL_NOLRT
|
||||
#ifndef LMP_INTEL_USELRT
|
||||
error->all(FLERR,
|
||||
"LRT otion for Intel package disabled at compile time");
|
||||
#endif
|
||||
|
||||
@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
|
||||
#include "verlet.h"
|
||||
#include "pppm_intel.h"
|
||||
|
||||
#ifndef LMP_INTEL_USELRT
|
||||
#define LMP_INTEL_NOLRT
|
||||
#else
|
||||
|
||||
#ifdef LMP_INTEL_USELRT
|
||||
#ifdef LMP_INTEL_LRT11
|
||||
#define _LMP_INTEL_LRT_11
|
||||
#include <thread>
|
||||
|
||||
@ -134,7 +134,7 @@ public:
|
||||
// Manifold itself:
|
||||
manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp,
|
||||
int narg, char **arg)
|
||||
: manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
|
||||
: manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
|
||||
|
||||
|
||||
manifold_gaussian_bump::~manifold_gaussian_bump()
|
||||
@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut()
|
||||
n( x, nn );
|
||||
double taper_z;
|
||||
if( xx <= rc1 ){
|
||||
taper_z = gaussian_bump(xx);
|
||||
taper_z = gaussian_bump(xx);
|
||||
}else if( xx < rc2 ){
|
||||
taper_z = lut_get_z( xx );
|
||||
taper_z = lut_get_z( xx );
|
||||
}else{
|
||||
taper_z = 0.0;
|
||||
taper_z = 0.0;
|
||||
}
|
||||
fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
|
||||
fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
|
||||
gg, nn[0], nn[1], nn[2] );
|
||||
}
|
||||
fclose(fp);
|
||||
|
||||
@ -98,7 +98,7 @@ int FixSRP::setmask()
|
||||
|
||||
void FixSRP::init()
|
||||
{
|
||||
if (force->pair_match("hybrid",1) == NULL)
|
||||
if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL)
|
||||
error->all(FLERR,"Cannot use pair srp without pair_style hybrid");
|
||||
|
||||
int has_rigid = 0;
|
||||
|
||||
@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
|
||||
|
||||
if (multiproc)
|
||||
error->all(FLERR,"Multi-processor writes are not supported.");
|
||||
if (multifile)
|
||||
error->all(FLERR,"Multiple files are not supported.");
|
||||
if (append_flag && multifile)
|
||||
error->all(FLERR,"Cannot append when writing to multiple files.");
|
||||
|
||||
perat = new nc_perat_t[nfield];
|
||||
|
||||
@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF()
|
||||
|
||||
void DumpNetCDF::openfile()
|
||||
{
|
||||
char *filecurrent = filename;
|
||||
if (multifile && !singlefile_opened) {
|
||||
char *filestar = filecurrent;
|
||||
filecurrent = new char[strlen(filestar) + 16];
|
||||
char *ptr = strchr(filestar,'*');
|
||||
*ptr = '\0';
|
||||
if (padflag == 0)
|
||||
sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
|
||||
filestar,update->ntimestep,ptr+1);
|
||||
else {
|
||||
char bif[8],pad[16];
|
||||
strcpy(bif,BIGINT_FORMAT);
|
||||
sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
|
||||
sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
|
||||
}
|
||||
*ptr = '*';
|
||||
}
|
||||
|
||||
if (thermo && !singlefile_opened) {
|
||||
if (thermovar) delete [] thermovar;
|
||||
thermovar = new int[output->thermo->nfield];
|
||||
@ -268,14 +286,14 @@ void DumpNetCDF::openfile()
|
||||
ntotalgr = group->count(igroup);
|
||||
|
||||
if (filewriter) {
|
||||
if (append_flag && access(filename, F_OK) != -1) {
|
||||
if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
|
||||
// Fixme! Perform checks if dimensions and variables conform with
|
||||
// data structure standard.
|
||||
|
||||
if (singlefile_opened) return;
|
||||
singlefile_opened = 1;
|
||||
|
||||
NCERRX( nc_open(filename, NC_WRITE, &ncid), filename );
|
||||
NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent );
|
||||
|
||||
// dimensions
|
||||
NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
|
||||
@ -348,8 +366,8 @@ void DumpNetCDF::openfile()
|
||||
if (singlefile_opened) return;
|
||||
singlefile_opened = 1;
|
||||
|
||||
NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid),
|
||||
filename );
|
||||
NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid),
|
||||
filecurrent );
|
||||
|
||||
// dimensions
|
||||
NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
|
||||
@ -598,15 +616,39 @@ void DumpNetCDF::closefile()
|
||||
if (filewriter && singlefile_opened) {
|
||||
NCERR( nc_close(ncid) );
|
||||
singlefile_opened = 0;
|
||||
// append next time DumpNetCDF::openfile is called
|
||||
append_flag = 1;
|
||||
// write to next frame upon next open
|
||||
framei++;
|
||||
if (multifile)
|
||||
framei = 1;
|
||||
else {
|
||||
// append next time DumpNetCDF::openfile is called
|
||||
append_flag = 1;
|
||||
framei++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <typename T>
|
||||
int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp)
|
||||
{
|
||||
return nc_put_var1_int(ncid, varid, index, tp);
|
||||
}
|
||||
|
||||
template <>
|
||||
int nc_put_var1_bigint<long>(int ncid, int varid, const size_t index[],
|
||||
const long* tp)
|
||||
{
|
||||
return nc_put_var1_long(ncid, varid, index, tp);
|
||||
}
|
||||
|
||||
template <>
|
||||
int nc_put_var1_bigint<long long>(int ncid, int varid, const size_t index[],
|
||||
const long long* tp)
|
||||
{
|
||||
return nc_put_var1_longlong(ncid, varid, index, tp);
|
||||
}
|
||||
|
||||
void DumpNetCDF::write()
|
||||
{
|
||||
// open file
|
||||
@ -638,13 +680,8 @@ void DumpNetCDF::write()
|
||||
th->keyword[i] );
|
||||
}
|
||||
else if (th->vtype[i] == BIGINT) {
|
||||
#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
|
||||
NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
|
||||
NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
|
||||
th->keyword[i] );
|
||||
#else
|
||||
NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
|
||||
th->keyword[i] );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -887,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg)
|
||||
return 2;
|
||||
}
|
||||
else if (strcmp(arg[iarg],"at") == 0) {
|
||||
if (!append_flag)
|
||||
error->all(FLERR,"expected 'append yes' before 'at' keyword");
|
||||
iarg++;
|
||||
framei = force->inumeric(FLERR,arg[iarg]);
|
||||
if (framei < 0) framei--;
|
||||
@ -911,68 +950,6 @@ int DumpNetCDF::modify_param(int narg, char **arg)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DumpNetCDF::write_prmtop()
|
||||
{
|
||||
char fn[1024];
|
||||
char tmp[81];
|
||||
FILE *f;
|
||||
|
||||
strcpy(fn, filename);
|
||||
strcat(fn, ".prmtop");
|
||||
|
||||
f = fopen(fn, "w");
|
||||
fprintf(f, "%%VERSION LAMMPS\n");
|
||||
fprintf(f, "%%FLAG TITLE\n");
|
||||
fprintf(f, "%%FORMAT(20a4)\n");
|
||||
memset(tmp, ' ', 76);
|
||||
tmp[76] = '\0';
|
||||
fprintf(f, "NASN%s\n", tmp);
|
||||
|
||||
fprintf(f, "%%FLAG POINTERS\n");
|
||||
fprintf(f, "%%FORMAT(10I8)\n");
|
||||
#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
|
||||
fprintf(f, "%8li", ntotalgr);
|
||||
#else
|
||||
fprintf(f, "%8i", ntotalgr);
|
||||
#endif
|
||||
for (int i = 0; i < 11; i++)
|
||||
fprintf(f, "%8i", 0);
|
||||
fprintf(f, "\n");
|
||||
for (int i = 0; i < 12; i++)
|
||||
fprintf(f, "%8i", 0);
|
||||
fprintf(f, "\n");
|
||||
for (int i = 0; i < 6; i++)
|
||||
fprintf(f, "%8i", 0);
|
||||
fprintf(f, "\n");
|
||||
|
||||
fprintf(f, "%%FLAG ATOM_NAME\n");
|
||||
fprintf(f, "%%FORMAT(20a4)\n");
|
||||
for (int i = 0; i < ntotalgr; i++) {
|
||||
fprintf(f, "%4s", "He");
|
||||
if ((i+1) % 20 == 0)
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
|
||||
fprintf(f, "%%FLAG CHARGE\n");
|
||||
fprintf(f, "%%FORMAT(5E16.5)\n");
|
||||
for (int i = 0; i < ntotalgr; i++) {
|
||||
fprintf(f, "%16.5e", 0.0);
|
||||
if ((i+1) % 5 == 0)
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
|
||||
fprintf(f, "%%FLAG MASS\n");
|
||||
fprintf(f, "%%FORMAT(5E16.5)\n");
|
||||
for (int i = 0; i < ntotalgr; i++) {
|
||||
fprintf(f, "%16.5e", 1.0);
|
||||
if ((i+1) % 5 == 0)
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DumpNetCDF::ncerr(int err, const char *descr, int line)
|
||||
{
|
||||
if (err != NC_NOERR) {
|
||||
|
||||
@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom {
|
||||
void closefile();
|
||||
virtual void write_header(bigint);
|
||||
virtual void write_data(int, double *);
|
||||
void write_prmtop();
|
||||
|
||||
virtual int modify_param(int, char **);
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user